1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
4 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
5 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
6 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
8 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
; Round-trips one scalar bfloat: loads it from the addrspace(1) pointer %in
; and stores it unchanged to the addrspace(1) pointer %out.
; Every checked configuration is expected to emit one 16-bit load, wait for
; it, and then emit one 16-bit store, with no other instruction between the
; load and the store. The GCN and GFX7 checks first build the s[4:7] buffer
; descriptor used by their addr64 buffer accesses.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the script instead of editing them by hand.
10 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
11 ; GCN-LABEL: test_load_store:
13 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GCN-NEXT: s_mov_b32 s6, 0
15 ; GCN-NEXT: s_mov_b32 s7, 0xf000
16 ; GCN-NEXT: s_mov_b32 s4, s6
17 ; GCN-NEXT: s_mov_b32 s5, s6
18 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
19 ; GCN-NEXT: s_waitcnt vmcnt(0)
20 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
21 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
22 ; GCN-NEXT: s_setpc_b64 s[30:31]
24 ; GFX7-LABEL: test_load_store:
26 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX7-NEXT: s_mov_b32 s6, 0
28 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
29 ; GFX7-NEXT: s_mov_b32 s4, s6
30 ; GFX7-NEXT: s_mov_b32 s5, s6
31 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
32 ; GFX7-NEXT: s_waitcnt vmcnt(0)
33 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
34 ; GFX7-NEXT: s_waitcnt vmcnt(0)
35 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37 ; GFX8-LABEL: test_load_store:
39 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
41 ; GFX8-NEXT: s_waitcnt vmcnt(0)
42 ; GFX8-NEXT: flat_store_short v[2:3], v0
43 ; GFX8-NEXT: s_waitcnt vmcnt(0)
44 ; GFX8-NEXT: s_setpc_b64 s[30:31]
46 ; GFX9-LABEL: test_load_store:
48 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
50 ; GFX9-NEXT: s_waitcnt vmcnt(0)
51 ; GFX9-NEXT: global_store_short v[2:3], v0, off
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
53 ; GFX9-NEXT: s_setpc_b64 s[30:31]
55 ; GFX10-LABEL: test_load_store:
57 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: global_store_short v[2:3], v0, off
61 ; GFX10-NEXT: s_setpc_b64 s[30:31]
63 ; GFX11-LABEL: test_load_store:
65 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
67 ; GFX11-NEXT: s_waitcnt vmcnt(0)
68 ; GFX11-NEXT: global_store_b16 v[2:3], v0, off
69 ; GFX11-NEXT: s_setpc_b64 s[30:31]
70 %val = load bfloat, ptr addrspace(1) %in
71 store bfloat %val, ptr addrspace(1) %out
; Returns a <2 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to fetch the vector with a single
; one-dword load. The GCN and GFX7 checks then unpack that dword into two
; result registers: v0 = dword << 16 and v1 = dword & 0xffff0000.
; The GFX8 and later checks return straight after the wait, leaving the
; loaded dword in v0.
75 define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
76 ; GCN-LABEL: v_load_global_v2bf16:
78 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79 ; GCN-NEXT: s_mov_b32 s6, 0
80 ; GCN-NEXT: s_mov_b32 s7, 0xf000
81 ; GCN-NEXT: s_mov_b32 s4, s6
82 ; GCN-NEXT: s_mov_b32 s5, s6
83 ; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
84 ; GCN-NEXT: s_waitcnt vmcnt(0)
85 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
86 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
87 ; GCN-NEXT: s_setpc_b64 s[30:31]
89 ; GFX7-LABEL: v_load_global_v2bf16:
91 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX7-NEXT: s_mov_b32 s6, 0
93 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
94 ; GFX7-NEXT: s_mov_b32 s4, s6
95 ; GFX7-NEXT: s_mov_b32 s5, s6
96 ; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
97 ; GFX7-NEXT: s_waitcnt vmcnt(0)
98 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
99 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
100 ; GFX7-NEXT: s_setpc_b64 s[30:31]
102 ; GFX8-LABEL: v_load_global_v2bf16:
104 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
106 ; GFX8-NEXT: s_waitcnt vmcnt(0)
107 ; GFX8-NEXT: s_setpc_b64 s[30:31]
109 ; GFX9-LABEL: v_load_global_v2bf16:
111 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
113 ; GFX9-NEXT: s_waitcnt vmcnt(0)
114 ; GFX9-NEXT: s_setpc_b64 s[30:31]
116 ; GFX10-LABEL: v_load_global_v2bf16:
118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-NEXT: s_setpc_b64 s[30:31]
123 ; GFX11-LABEL: v_load_global_v2bf16:
125 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
127 ; GFX11-NEXT: s_waitcnt vmcnt(0)
128 ; GFX11-NEXT: s_setpc_b64 s[30:31]
129 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
130 ret <2 x bfloat> %load
; Returns a <3 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to use one two-dword load, so the
; access covers 64 bits even though the vector holds three 16-bit elements.
; The GCN and GFX7 checks then produce three result registers:
; v0 = dword0 << 16, v1 = dword0 & 0xffff0000, v2 = dword1 << 16.
; The GFX8 and later checks return the two loaded dwords in v[0:1] as is.
133 define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
134 ; GCN-LABEL: v_load_global_v3bf16:
136 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GCN-NEXT: s_mov_b32 s6, 0
138 ; GCN-NEXT: s_mov_b32 s7, 0xf000
139 ; GCN-NEXT: s_mov_b32 s4, s6
140 ; GCN-NEXT: s_mov_b32 s5, s6
141 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
142 ; GCN-NEXT: s_waitcnt vmcnt(0)
143 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
144 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
145 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
146 ; GCN-NEXT: s_setpc_b64 s[30:31]
148 ; GFX7-LABEL: v_load_global_v3bf16:
150 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151 ; GFX7-NEXT: s_mov_b32 s6, 0
152 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
153 ; GFX7-NEXT: s_mov_b32 s4, s6
154 ; GFX7-NEXT: s_mov_b32 s5, s6
155 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
156 ; GFX7-NEXT: s_waitcnt vmcnt(0)
157 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
158 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
159 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
160 ; GFX7-NEXT: s_setpc_b64 s[30:31]
162 ; GFX8-LABEL: v_load_global_v3bf16:
164 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
166 ; GFX8-NEXT: s_waitcnt vmcnt(0)
167 ; GFX8-NEXT: s_setpc_b64 s[30:31]
169 ; GFX9-LABEL: v_load_global_v3bf16:
171 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
173 ; GFX9-NEXT: s_waitcnt vmcnt(0)
174 ; GFX9-NEXT: s_setpc_b64 s[30:31]
176 ; GFX10-LABEL: v_load_global_v3bf16:
178 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
180 ; GFX10-NEXT: s_waitcnt vmcnt(0)
181 ; GFX10-NEXT: s_setpc_b64 s[30:31]
183 ; GFX11-LABEL: v_load_global_v3bf16:
185 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
187 ; GFX11-NEXT: s_waitcnt vmcnt(0)
188 ; GFX11-NEXT: s_setpc_b64 s[30:31]
189 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
190 ret <3 x bfloat> %load
; Returns a <4 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to use one two-dword load.
; The GCN and GFX7 checks load into v[2:3] and then unpack each dword into a
; pair of result registers: v0 = dword0 << 16, v1 = dword0 & 0xffff0000,
; v2 = dword1 << 16, v3 = dword1 & 0xffff0000.
; The GFX8 and later checks return the two loaded dwords in v[0:1] as is.
193 define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
194 ; GCN-LABEL: v_load_global_v4bf16:
196 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; GCN-NEXT: s_mov_b32 s6, 0
198 ; GCN-NEXT: s_mov_b32 s7, 0xf000
199 ; GCN-NEXT: s_mov_b32 s4, s6
200 ; GCN-NEXT: s_mov_b32 s5, s6
201 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
202 ; GCN-NEXT: s_waitcnt vmcnt(0)
203 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
204 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
205 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
206 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
207 ; GCN-NEXT: s_setpc_b64 s[30:31]
209 ; GFX7-LABEL: v_load_global_v4bf16:
211 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX7-NEXT: s_mov_b32 s6, 0
213 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
214 ; GFX7-NEXT: s_mov_b32 s4, s6
215 ; GFX7-NEXT: s_mov_b32 s5, s6
216 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
217 ; GFX7-NEXT: s_waitcnt vmcnt(0)
218 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
219 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
220 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
221 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
222 ; GFX7-NEXT: s_setpc_b64 s[30:31]
224 ; GFX8-LABEL: v_load_global_v4bf16:
226 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
228 ; GFX8-NEXT: s_waitcnt vmcnt(0)
229 ; GFX8-NEXT: s_setpc_b64 s[30:31]
231 ; GFX9-LABEL: v_load_global_v4bf16:
233 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
235 ; GFX9-NEXT: s_waitcnt vmcnt(0)
236 ; GFX9-NEXT: s_setpc_b64 s[30:31]
238 ; GFX10-LABEL: v_load_global_v4bf16:
240 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
242 ; GFX10-NEXT: s_waitcnt vmcnt(0)
243 ; GFX10-NEXT: s_setpc_b64 s[30:31]
245 ; GFX11-LABEL: v_load_global_v4bf16:
247 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
249 ; GFX11-NEXT: s_waitcnt vmcnt(0)
250 ; GFX11-NEXT: s_setpc_b64 s[30:31]
251 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
252 ret <4 x bfloat> %load
; Returns a <6 x bfloat> loaded from the addrspace(1) pointer %ptr.
; The expected load width differs between configurations: the GCN checks use
; a four-dword buffer load into v[3:6] (only v3..v5 are read afterwards),
; while the GFX7 checks use a three-dword buffer load into v[3:5]. Both then
; unpack three dwords into six result registers v0..v5, using "<< 16" for
; the even-numbered results and "& 0xffff0000" for the odd-numbered ones.
; The GFX8 and later checks use a single three-dword (96-bit) load into
; v[0:2] and return it as is.
255 define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
256 ; GCN-LABEL: v_load_global_v6bf16:
258 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259 ; GCN-NEXT: s_mov_b32 s6, 0
260 ; GCN-NEXT: s_mov_b32 s7, 0xf000
261 ; GCN-NEXT: s_mov_b32 s4, s6
262 ; GCN-NEXT: s_mov_b32 s5, s6
263 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
264 ; GCN-NEXT: s_waitcnt vmcnt(0)
265 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
266 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
267 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
268 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
269 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
270 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
271 ; GCN-NEXT: s_setpc_b64 s[30:31]
273 ; GFX7-LABEL: v_load_global_v6bf16:
275 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX7-NEXT: s_mov_b32 s6, 0
277 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
278 ; GFX7-NEXT: s_mov_b32 s4, s6
279 ; GFX7-NEXT: s_mov_b32 s5, s6
280 ; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
281 ; GFX7-NEXT: s_waitcnt vmcnt(0)
282 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
283 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
284 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
285 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
286 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
287 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
288 ; GFX7-NEXT: s_setpc_b64 s[30:31]
290 ; GFX8-LABEL: v_load_global_v6bf16:
292 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
294 ; GFX8-NEXT: s_waitcnt vmcnt(0)
295 ; GFX8-NEXT: s_setpc_b64 s[30:31]
297 ; GFX9-LABEL: v_load_global_v6bf16:
299 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
301 ; GFX9-NEXT: s_waitcnt vmcnt(0)
302 ; GFX9-NEXT: s_setpc_b64 s[30:31]
304 ; GFX10-LABEL: v_load_global_v6bf16:
306 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
308 ; GFX10-NEXT: s_waitcnt vmcnt(0)
309 ; GFX10-NEXT: s_setpc_b64 s[30:31]
311 ; GFX11-LABEL: v_load_global_v6bf16:
313 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
315 ; GFX11-NEXT: s_waitcnt vmcnt(0)
316 ; GFX11-NEXT: s_setpc_b64 s[30:31]
317 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
318 ret <6 x bfloat> %load
; Returns an <8 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to use one four-dword (128-bit)
; load. The GCN and GFX7 checks load into v[4:7] and unpack the four dwords
; into eight result registers v0..v7, using "<< 16" for the even-numbered
; results and "& 0xffff0000" for the odd-numbered ones.
; The GFX8 and later checks load into v[0:3] and return it as is.
321 define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
322 ; GCN-LABEL: v_load_global_v8bf16:
324 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; GCN-NEXT: s_mov_b32 s6, 0
326 ; GCN-NEXT: s_mov_b32 s7, 0xf000
327 ; GCN-NEXT: s_mov_b32 s4, s6
328 ; GCN-NEXT: s_mov_b32 s5, s6
329 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
330 ; GCN-NEXT: s_waitcnt vmcnt(0)
331 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
332 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
333 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
334 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
335 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
336 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
337 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
338 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
339 ; GCN-NEXT: s_setpc_b64 s[30:31]
341 ; GFX7-LABEL: v_load_global_v8bf16:
343 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX7-NEXT: s_mov_b32 s6, 0
345 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
346 ; GFX7-NEXT: s_mov_b32 s4, s6
347 ; GFX7-NEXT: s_mov_b32 s5, s6
348 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
349 ; GFX7-NEXT: s_waitcnt vmcnt(0)
350 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
351 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
352 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
353 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
354 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
355 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
356 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
357 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
358 ; GFX7-NEXT: s_setpc_b64 s[30:31]
360 ; GFX8-LABEL: v_load_global_v8bf16:
362 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
364 ; GFX8-NEXT: s_waitcnt vmcnt(0)
365 ; GFX8-NEXT: s_setpc_b64 s[30:31]
367 ; GFX9-LABEL: v_load_global_v8bf16:
369 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
372 ; GFX9-NEXT: s_setpc_b64 s[30:31]
374 ; GFX10-LABEL: v_load_global_v8bf16:
376 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
378 ; GFX10-NEXT: s_waitcnt vmcnt(0)
379 ; GFX10-NEXT: s_setpc_b64 s[30:31]
381 ; GFX11-LABEL: v_load_global_v8bf16:
383 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
385 ; GFX11-NEXT: s_waitcnt vmcnt(0)
386 ; GFX11-NEXT: s_setpc_b64 s[30:31]
387 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
388 ret <8 x bfloat> %load
; Returns a <16 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to split the access into two
; four-dword loads, at byte offsets 0 and 16 from %ptr.
; - GCN and GFX7 checks: both buffer loads are issued back to back; the first
;   result (v[4:7]) is unpacked into v0..v7 after "s_waitcnt vmcnt(1)" and
;   the second (v[12:15]) into v8..v15 after "s_waitcnt vmcnt(0)".
; - GFX8 checks: the flat loads carry no offset, so the second address is
;   formed with a v_add_u32 / v_addc_u32 pair that adds 16.
; - GFX9, GFX10 and GFX11 checks: the second load uses "offset:16"; GFX10 and
;   GFX11 additionally expect "s_clause 0x1" in front of the two loads.
; In the GFX8 and later checks the pointer is first copied out of v[0:1],
; which overlaps the destination v[0:3] of the first load.
391 define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
392 ; GCN-LABEL: v_load_global_v16bf16:
394 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395 ; GCN-NEXT: s_mov_b32 s6, 0
396 ; GCN-NEXT: s_mov_b32 s7, 0xf000
397 ; GCN-NEXT: s_mov_b32 s4, s6
398 ; GCN-NEXT: s_mov_b32 s5, s6
399 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
400 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
401 ; GCN-NEXT: s_waitcnt vmcnt(1)
402 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
403 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
404 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
405 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
406 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
407 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
408 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
409 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
410 ; GCN-NEXT: s_waitcnt vmcnt(0)
411 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
412 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
413 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
414 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
415 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
416 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
417 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
418 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
419 ; GCN-NEXT: s_setpc_b64 s[30:31]
421 ; GFX7-LABEL: v_load_global_v16bf16:
423 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GFX7-NEXT: s_mov_b32 s6, 0
425 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
426 ; GFX7-NEXT: s_mov_b32 s4, s6
427 ; GFX7-NEXT: s_mov_b32 s5, s6
428 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
429 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
430 ; GFX7-NEXT: s_waitcnt vmcnt(1)
431 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
432 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
433 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
434 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
435 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
436 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
437 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
438 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
439 ; GFX7-NEXT: s_waitcnt vmcnt(0)
440 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
441 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
442 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
443 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
444 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
445 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
446 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
447 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
448 ; GFX7-NEXT: s_setpc_b64 s[30:31]
450 ; GFX8-LABEL: v_load_global_v16bf16:
452 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
454 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
455 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
456 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
457 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
458 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
459 ; GFX8-NEXT: s_waitcnt vmcnt(0)
460 ; GFX8-NEXT: s_setpc_b64 s[30:31]
462 ; GFX9-LABEL: v_load_global_v16bf16:
464 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465 ; GFX9-NEXT: v_mov_b32_e32 v9, v1
466 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
467 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
468 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: s_setpc_b64 s[30:31]
472 ; GFX10-LABEL: v_load_global_v16bf16:
474 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475 ; GFX10-NEXT: v_mov_b32_e32 v9, v1
476 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
477 ; GFX10-NEXT: s_clause 0x1
478 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
479 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
480 ; GFX10-NEXT: s_waitcnt vmcnt(0)
481 ; GFX10-NEXT: s_setpc_b64 s[30:31]
483 ; GFX11-LABEL: v_load_global_v16bf16:
485 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
487 ; GFX11-NEXT: s_clause 0x1
488 ; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off
489 ; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
490 ; GFX11-NEXT: s_waitcnt vmcnt(0)
491 ; GFX11-NEXT: s_setpc_b64 s[30:31]
492 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
493 ret <16 x bfloat> %load
; Returns a <32 x bfloat> loaded from the addrspace(1) pointer %ptr.
; Every checked configuration is expected to split the access into four
; four-dword loads, at byte offsets 0, 16, 32 and 48 from %ptr.
; - GCN and GFX7 checks: the four buffer loads are issued back to back into
;   v[4:7], v[12:15], v[20:23] and v[28:31]; each result is unpacked into
;   eight result registers after a wait whose vmcnt steps down 3, 2, 1, 0.
;   Unpacking uses "<< 16" for even-numbered results and "& 0xffff0000" for
;   odd-numbered ones, filling v0..v31.
; - GFX8 checks: the flat loads carry no offset, so the addresses for offsets
;   16, 32 and 48 are each formed with a v_add_u32 / v_addc_u32 pair.
; - GFX9, GFX10 and GFX11 checks: the offsets are encoded on the loads
;   themselves; GFX10 and GFX11 additionally expect "s_clause 0x3" in front
;   of the four loads.
; In the GFX8 and later checks the pointer is first copied out of v[0:1],
; which overlaps the destination v[0:3] of the first load.
496 define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
497 ; GCN-LABEL: v_load_global_v32bf16:
499 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500 ; GCN-NEXT: s_mov_b32 s6, 0
501 ; GCN-NEXT: s_mov_b32 s7, 0xf000
502 ; GCN-NEXT: s_mov_b32 s4, s6
503 ; GCN-NEXT: s_mov_b32 s5, s6
504 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
505 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
506 ; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
507 ; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
508 ; GCN-NEXT: s_waitcnt vmcnt(3)
509 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
510 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
511 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
512 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
513 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
514 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
515 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
516 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
517 ; GCN-NEXT: s_waitcnt vmcnt(2)
518 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
519 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
520 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
521 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
522 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
523 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
524 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
525 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
526 ; GCN-NEXT: s_waitcnt vmcnt(1)
527 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
528 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
529 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
530 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
531 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
532 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
533 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
534 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
535 ; GCN-NEXT: s_waitcnt vmcnt(0)
536 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
537 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
538 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
539 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
540 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
541 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
542 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
543 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
544 ; GCN-NEXT: s_setpc_b64 s[30:31]
546 ; GFX7-LABEL: v_load_global_v32bf16:
548 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549 ; GFX7-NEXT: s_mov_b32 s6, 0
550 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
551 ; GFX7-NEXT: s_mov_b32 s4, s6
552 ; GFX7-NEXT: s_mov_b32 s5, s6
553 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
554 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
555 ; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
556 ; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
557 ; GFX7-NEXT: s_waitcnt vmcnt(3)
558 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
559 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
560 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
561 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
562 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
563 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
564 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
565 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
566 ; GFX7-NEXT: s_waitcnt vmcnt(2)
567 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
568 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
569 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
570 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
571 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
572 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
573 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
574 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
575 ; GFX7-NEXT: s_waitcnt vmcnt(1)
576 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
577 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
578 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
579 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
580 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
581 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
582 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
583 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
584 ; GFX7-NEXT: s_waitcnt vmcnt(0)
585 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
586 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
587 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
588 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
589 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
590 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
591 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
592 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
593 ; GFX7-NEXT: s_setpc_b64 s[30:31]
595 ; GFX8-LABEL: v_load_global_v32bf16:
597 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GFX8-NEXT: v_mov_b32_e32 v12, v0
599 ; GFX8-NEXT: v_mov_b32_e32 v13, v1
600 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12
601 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc
602 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12
603 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
604 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13]
605 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12
606 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
607 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
608 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
609 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
610 ; GFX8-NEXT: s_waitcnt vmcnt(0)
611 ; GFX8-NEXT: s_setpc_b64 s[30:31]
613 ; GFX9-LABEL: v_load_global_v32bf16:
615 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616 ; GFX9-NEXT: v_mov_b32_e32 v17, v1
617 ; GFX9-NEXT: v_mov_b32_e32 v16, v0
618 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
619 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
620 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
621 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
622 ; GFX9-NEXT: s_waitcnt vmcnt(0)
623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
625 ; GFX10-LABEL: v_load_global_v32bf16:
627 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GFX10-NEXT: v_mov_b32_e32 v17, v1
629 ; GFX10-NEXT: v_mov_b32_e32 v16, v0
630 ; GFX10-NEXT: s_clause 0x3
631 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
632 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
633 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
634 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
635 ; GFX10-NEXT: s_waitcnt vmcnt(0)
636 ; GFX10-NEXT: s_setpc_b64 s[30:31]
638 ; GFX11-LABEL: v_load_global_v32bf16:
640 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641 ; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
642 ; GFX11-NEXT: s_clause 0x3
643 ; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off
644 ; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
645 ; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
646 ; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
647 ; GFX11-NEXT: s_waitcnt vmcnt(0)
648 ; GFX11-NEXT: s_setpc_b64 s[30:31]
649 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
650 ret <32 x bfloat> %load
653 define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
654 ; GCN-LABEL: v_load_global_v64bf16:
656 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657 ; GCN-NEXT: s_mov_b32 s7, 0xf000
658 ; GCN-NEXT: s_mov_b32 s6, 0
659 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
660 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
661 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
662 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
663 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
664 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
665 ; GCN-NEXT: s_mov_b32 s4, s6
666 ; GCN-NEXT: s_mov_b32 s5, s6
667 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
668 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
669 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
670 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
671 ; GCN-NEXT: s_waitcnt vmcnt(0)
672 ; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
673 ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
674 ; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
675 ; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
676 ; GCN-NEXT: s_waitcnt expcnt(0)
677 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
678 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
679 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
680 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
681 ; GCN-NEXT: s_waitcnt vmcnt(0)
682 ; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
683 ; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
684 ; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
685 ; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
686 ; GCN-NEXT: s_waitcnt expcnt(0)
687 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
688 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0
689 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0
690 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0
691 ; GCN-NEXT: s_waitcnt vmcnt(0)
692 ; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
693 ; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
694 ; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
695 ; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
696 ; GCN-NEXT: s_waitcnt expcnt(0)
697 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
698 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0
699 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
700 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
701 ; GCN-NEXT: s_waitcnt vmcnt(0)
702 ; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
703 ; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
704 ; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
705 ; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
706 ; GCN-NEXT: s_waitcnt expcnt(0)
707 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
708 ; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
709 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
710 ; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
711 ; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
712 ; GCN-NEXT: s_waitcnt vmcnt(2)
713 ; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
714 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0
715 ; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen
716 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
717 ; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen
718 ; GCN-NEXT: s_waitcnt expcnt(0)
719 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0
720 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
721 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0
722 ; GCN-NEXT: s_waitcnt expcnt(0)
723 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
724 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0
725 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0
726 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
727 ; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
728 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
729 ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
730 ; GCN-NEXT: s_waitcnt expcnt(0)
731 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0
732 ; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
733 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
734 ; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
735 ; GCN-NEXT: s_waitcnt expcnt(0)
736 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0
737 ; GCN-NEXT: s_waitcnt vmcnt(8)
738 ; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
739 ; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen
740 ; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
741 ; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
742 ; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
743 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
744 ; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
745 ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
746 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
747 ; GCN-NEXT: s_setpc_b64 s[30:31]
749 ; GFX7-LABEL: v_load_global_v64bf16:
751 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GFX7-NEXT: s_mov_b32 s6, 0
753 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
754 ; GFX7-NEXT: s_mov_b32 s4, s6
755 ; GFX7-NEXT: s_mov_b32 s5, s6
756 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
757 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
758 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
759 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
760 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
761 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
762 ; GFX7-NEXT: s_waitcnt vmcnt(0)
763 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
764 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
765 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
766 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
767 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
768 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0
769 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0
770 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0
771 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0
772 ; GFX7-NEXT: s_waitcnt vmcnt(0)
773 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
774 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
775 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
776 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
777 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
778 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
779 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
780 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0
781 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0
782 ; GFX7-NEXT: s_waitcnt vmcnt(0)
783 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
784 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
785 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
786 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
787 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
788 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
789 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
790 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
791 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0
792 ; GFX7-NEXT: s_waitcnt vmcnt(0)
793 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
794 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
795 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
796 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
797 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
798 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
799 ; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
800 ; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
801 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0
802 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
803 ; GFX7-NEXT: s_waitcnt vmcnt(3)
804 ; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
805 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0
806 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
807 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
808 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
809 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0
810 ; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
811 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0
812 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
813 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0
814 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0
815 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0
816 ; GFX7-NEXT: s_waitcnt vmcnt(6)
817 ; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
818 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
819 ; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
820 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0
821 ; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
822 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
823 ; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
824 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
825 ; GFX7-NEXT: s_waitcnt vmcnt(9)
826 ; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
827 ; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen
828 ; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
829 ; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
830 ; GFX7-NEXT: s_waitcnt vmcnt(12)
831 ; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen
832 ; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
833 ; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
834 ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
835 ; GFX7-NEXT: s_waitcnt vmcnt(0)
836 ; GFX7-NEXT: s_setpc_b64 s[30:31]
838 ; GFX8-LABEL: v_load_global_v64bf16:
840 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841 ; GFX8-NEXT: v_mov_b32_e32 v28, v0
842 ; GFX8-NEXT: v_mov_b32_e32 v29, v1
843 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28
844 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc
845 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28
846 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc
847 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28
848 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc
849 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28
850 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc
851 ; GFX8-NEXT: s_movk_i32 s4, 0x50
852 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28
853 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc
854 ; GFX8-NEXT: s_movk_i32 s4, 0x60
855 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28
856 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
857 ; GFX8-NEXT: s_movk_i32 s4, 0x70
858 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
859 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
860 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
861 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
862 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
863 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
864 ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
865 ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
866 ; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
867 ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
868 ; GFX8-NEXT: s_waitcnt vmcnt(0)
869 ; GFX8-NEXT: s_setpc_b64 s[30:31]
871 ; GFX9-LABEL: v_load_global_v64bf16:
873 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
874 ; GFX9-NEXT: v_mov_b32_e32 v29, v1
875 ; GFX9-NEXT: v_mov_b32_e32 v28, v0
876 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
877 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
878 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
879 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
880 ; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
881 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
882 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
884 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
885 ; GFX9-NEXT: s_waitcnt vmcnt(0)
886 ; GFX9-NEXT: s_setpc_b64 s[30:31]
888 ; GFX10-LABEL: v_load_global_v64bf16:
890 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891 ; GFX10-NEXT: v_mov_b32_e32 v33, v1
892 ; GFX10-NEXT: v_mov_b32_e32 v32, v0
893 ; GFX10-NEXT: s_clause 0x7
894 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off
895 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16
896 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32
897 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48
898 ; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64
899 ; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80
900 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96
901 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112
902 ; GFX10-NEXT: s_waitcnt vmcnt(0)
903 ; GFX10-NEXT: s_setpc_b64 s[30:31]
905 ; GFX11-LABEL: v_load_global_v64bf16:
907 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
908 ; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
909 ; GFX11-NEXT: s_clause 0x7
910 ; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off
911 ; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
912 ; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
913 ; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
914 ; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
915 ; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
916 ; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
917 ; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
918 ; GFX11-NEXT: s_waitcnt vmcnt(0)
919 ; GFX11-NEXT: s_setpc_b64 s[30:31]
920 %load = load <64 x bfloat>, ptr addrspace(1) %ptr
921 ret <64 x bfloat> %load
; Test: store a <2 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix:
;   - GCN and GFX7 expect v_mul_f32 by 1.0 on v0 and v1, a v_lshrrev_b32 by 16
;     and a v_alignbit_b32 feeding a single addr64 buffer_store_dword.
;   - GFX8 expects one flat_store_dword, GFX9 and GFX10 one global_store_dword,
;     and GFX11 one global_store_b32, each storing v0 directly.
924 define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
925 ; GCN-LABEL: v_store_global_v2bf16:
927 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
929 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
930 ; GCN-NEXT: s_mov_b32 s6, 0
931 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
932 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
933 ; GCN-NEXT: s_mov_b32 s7, 0xf000
934 ; GCN-NEXT: s_mov_b32 s4, s6
935 ; GCN-NEXT: s_mov_b32 s5, s6
936 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
937 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
938 ; GCN-NEXT: s_setpc_b64 s[30:31]
940 ; GFX7-LABEL: v_store_global_v2bf16:
942 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
944 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
945 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
946 ; GFX7-NEXT: s_mov_b32 s6, 0
947 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
948 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
949 ; GFX7-NEXT: s_mov_b32 s4, s6
950 ; GFX7-NEXT: s_mov_b32 s5, s6
951 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
952 ; GFX7-NEXT: s_waitcnt vmcnt(0)
953 ; GFX7-NEXT: s_setpc_b64 s[30:31]
955 ; GFX8-LABEL: v_store_global_v2bf16:
957 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
958 ; GFX8-NEXT: flat_store_dword v[1:2], v0
959 ; GFX8-NEXT: s_waitcnt vmcnt(0)
960 ; GFX8-NEXT: s_setpc_b64 s[30:31]
962 ; GFX9-LABEL: v_store_global_v2bf16:
964 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
965 ; GFX9-NEXT: global_store_dword v[1:2], v0, off
966 ; GFX9-NEXT: s_waitcnt vmcnt(0)
967 ; GFX9-NEXT: s_setpc_b64 s[30:31]
969 ; GFX10-LABEL: v_store_global_v2bf16:
971 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
972 ; GFX10-NEXT: global_store_dword v[1:2], v0, off
973 ; GFX10-NEXT: s_setpc_b64 s[30:31]
975 ; GFX11-LABEL: v_store_global_v2bf16:
977 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off
979 ; GFX11-NEXT: s_setpc_b64 s[30:31]
980 store <2 x bfloat> %val, ptr addrspace(1) %ptr
; Test: store a <3 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix (every target splits the store into
; a 16-bit store at offset 4 plus a 32-bit store at offset 0):
;   - GCN and GFX7 expect the v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32
;     sequence, then buffer_store_short offset:4 and buffer_store_dword.
;   - GFX8 expects flat_store_dword, then v_add_u32 / v_addc_u32 adding 4 to
;     the address pair before flat_store_short.
;   - GFX9 and GFX10 expect global_store_short offset:4 and global_store_dword.
;   - GFX11 expects s_clause 0x1 followed by global_store_b16 offset:4 and
;     global_store_b32.
984 define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
985 ; GCN-LABEL: v_store_global_v3bf16:
987 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
989 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
990 ; GCN-NEXT: s_mov_b32 s7, 0xf000
991 ; GCN-NEXT: s_mov_b32 s6, 0
992 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
993 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
994 ; GCN-NEXT: s_mov_b32 s4, s6
995 ; GCN-NEXT: s_mov_b32 s5, s6
996 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
997 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
998 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
999 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
1000 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1001 ; GCN-NEXT: s_setpc_b64 s[30:31]
1003 ; GFX7-LABEL: v_store_global_v3bf16:
1005 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1007 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1008 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1009 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
1010 ; GFX7-NEXT: s_mov_b32 s6, 0
1011 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
1012 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1013 ; GFX7-NEXT: s_mov_b32 s4, s6
1014 ; GFX7-NEXT: s_mov_b32 s5, s6
1015 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1016 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
1017 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
1018 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1019 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1021 ; GFX8-LABEL: v_store_global_v3bf16:
1023 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024 ; GFX8-NEXT: flat_store_dword v[2:3], v0
1025 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
1026 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1027 ; GFX8-NEXT: flat_store_short v[2:3], v1
1028 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1029 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1031 ; GFX9-LABEL: v_store_global_v3bf16:
1033 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034 ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
1035 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
1036 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1037 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1039 ; GFX10-LABEL: v_store_global_v3bf16:
1041 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1042 ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
1043 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
1044 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1046 ; GFX11-LABEL: v_store_global_v3bf16:
1048 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049 ; GFX11-NEXT: s_clause 0x1
1050 ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
1051 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
1052 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1053 store <3 x bfloat> %val, ptr addrspace(1) %ptr
; Test: store a <4 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix:
;   - GCN and GFX7 expect four v_mul_f32 by 1.0, two v_lshrrev_b32 by 16 and
;     two v_alignbit_b32 feeding a single addr64 buffer_store_dwordx2.
;   - GFX8 expects one flat_store_dwordx2, GFX9 and GFX10 one
;     global_store_dwordx2, and GFX11 one global_store_b64, each storing
;     v[0:1] directly.
1057 define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
1058 ; GCN-LABEL: v_store_global_v4bf16:
1060 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1061 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1062 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
1063 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1064 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1065 ; GCN-NEXT: s_mov_b32 s6, 0
1066 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1067 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1068 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
1069 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
1070 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1071 ; GCN-NEXT: s_mov_b32 s4, s6
1072 ; GCN-NEXT: s_mov_b32 s5, s6
1073 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
1074 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1075 ; GCN-NEXT: s_setpc_b64 s[30:31]
1077 ; GFX7-LABEL: v_store_global_v4bf16:
1079 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1081 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1082 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1083 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1084 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1085 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1086 ; GFX7-NEXT: s_mov_b32 s6, 0
1087 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
1088 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
1089 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1090 ; GFX7-NEXT: s_mov_b32 s4, s6
1091 ; GFX7-NEXT: s_mov_b32 s5, s6
1092 ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
1093 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1094 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1096 ; GFX8-LABEL: v_store_global_v4bf16:
1098 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1100 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1101 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1103 ; GFX9-LABEL: v_store_global_v4bf16:
1105 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1107 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1110 ; GFX10-LABEL: v_store_global_v4bf16:
1112 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1113 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1114 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1116 ; GFX11-LABEL: v_store_global_v4bf16:
1118 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1119 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
1120 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1121 store <4 x bfloat> %val, ptr addrspace(1) %ptr
; Test: store an <8 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix:
;   - GCN and GFX7 expect eight v_mul_f32 by 1.0, four v_lshrrev_b32 by 16 and
;     four v_alignbit_b32 feeding a single addr64 buffer_store_dwordx4.
;   - GFX8 expects one flat_store_dwordx4, GFX9 and GFX10 one
;     global_store_dwordx4, and GFX11 one global_store_b128, each storing
;     v[0:3] directly.
1125 define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
1126 ; GCN-LABEL: v_store_global_v8bf16:
1128 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1129 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1130 ; GCN-NEXT: s_mov_b32 s6, 0
1131 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1132 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1133 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1134 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1135 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1136 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
1137 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1138 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1139 ; GCN-NEXT: s_mov_b32 s4, s6
1140 ; GCN-NEXT: s_mov_b32 s5, s6
1141 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
1142 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1143 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
1144 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
1145 ; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
1146 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
1147 ; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
1148 ; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
1149 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1150 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1151 ; GCN-NEXT: s_setpc_b64 s[30:31]
1153 ; GFX7-LABEL: v_store_global_v8bf16:
1155 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1156 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1157 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1158 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1159 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1160 ; GFX7-NEXT: s_mov_b32 s6, 0
1161 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1162 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1163 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1164 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1165 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1166 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1167 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1168 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1169 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1170 ; GFX7-NEXT: s_mov_b32 s4, s6
1171 ; GFX7-NEXT: s_mov_b32 s5, s6
1172 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
1173 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
1174 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
1175 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
1176 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
1177 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1178 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1180 ; GFX8-LABEL: v_store_global_v8bf16:
1182 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1183 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1184 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1185 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1187 ; GFX9-LABEL: v_store_global_v8bf16:
1189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1190 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
1191 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1192 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1194 ; GFX10-LABEL: v_store_global_v8bf16:
1196 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
1198 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1200 ; GFX11-LABEL: v_store_global_v8bf16:
1202 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1203 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
1204 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1205 store <8 x bfloat> %val, ptr addrspace(1) %ptr
; Test: store a <16 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix (every target emits two 128-bit
; stores, one at offset 16 and one at offset 0):
;   - GCN and GFX7 expect sixteen v_mul_f32 by 1.0 plus v_lshrrev_b32 /
;     v_alignbit_b32 pairs, then two addr64 buffer_store_dwordx4.
;   - GFX8 expects flat_store_dwordx4, s_nop 0, v_add_u32 / v_addc_u32 adding
;     16 to the address pair, then a second flat_store_dwordx4.
;   - GFX9 and GFX10 expect two global_store_dwordx4 (offset:16, then none).
;   - GFX11 expects s_clause 0x1 followed by two global_store_b128.
1209 define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
1210 ; GCN-LABEL: v_store_global_v16bf16:
1212 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1213 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1214 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1215 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1216 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1217 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1218 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
1219 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1220 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1221 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1222 ; GCN-NEXT: s_mov_b32 s6, 0
1223 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
1224 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1225 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1226 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1227 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
1228 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1229 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1230 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1231 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1232 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1233 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1234 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
1235 ; GCN-NEXT: s_mov_b32 s4, s6
1236 ; GCN-NEXT: s_mov_b32 s5, s6
1237 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
1238 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
1239 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
1240 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
1241 ; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
1242 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
1243 ; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
1244 ; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
1245 ; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
1246 ; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
1247 ; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
1248 ; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
1249 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
1250 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
1251 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1252 ; GCN-NEXT: s_setpc_b64 s[30:31]
1254 ; GFX7-LABEL: v_store_global_v16bf16:
1256 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1257 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1258 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1259 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1260 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1261 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1262 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1263 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1264 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1265 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1266 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
1267 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
1268 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
1269 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1270 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1271 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1272 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
1273 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
1274 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1275 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
1276 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1277 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
1278 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1279 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
1280 ; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
1281 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
1282 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1283 ; GFX7-NEXT: s_mov_b32 s6, 0
1284 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1285 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
1286 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1287 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1288 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1289 ; GFX7-NEXT: s_mov_b32 s4, s6
1290 ; GFX7-NEXT: s_mov_b32 s5, s6
1291 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
1292 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
1293 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
1294 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
1295 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1296 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1298 ; GFX8-LABEL: v_store_global_v16bf16:
1300 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
1302 ; GFX8-NEXT: s_nop 0
1303 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
1304 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
1305 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1306 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1307 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1309 ; GFX9-LABEL: v_store_global_v16bf16:
1311 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1312 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
1313 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
1314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1317 ; GFX10-LABEL: v_store_global_v16bf16:
1319 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
1321 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
1322 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1324 ; GFX11-LABEL: v_store_global_v16bf16:
1326 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1327 ; GFX11-NEXT: s_clause 0x1
1328 ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
1329 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
1330 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1331 store <16 x bfloat> %val, ptr addrspace(1) %ptr
; Test: store a <32 x bfloat> argument through a ptr addrspace(1) pointer.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Recorded expectations per check prefix (every target emits four 128-bit
; stores covering offsets 0, 16, 32 and 48):
;   - GCN and GFX7 expect the v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32
;     sequence for each pair of elements, plus three buffer_load_dword from
;     s[0:3] relative to s32 (offsets 0, 4 and 8) before the four addr64
;     buffer_store_dwordx4. NOTE(review): the loaded v[0:1] is used as the
;     store address, so these look like arguments passed on the stack;
;     confirm against the AMDGPU calling convention.
;   - GFX8 expects flat_store_dwordx4 four times, recomputing the address with
;     v_add_u32 / v_addc_u32 for offsets 48, 32 and 16.
;   - GFX9 and GFX10 expect four global_store_dwordx4 with immediate offsets.
;   - GFX11 expects s_clause 0x3 followed by four global_store_b128.
1335 define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
1336 ; GCN-LABEL: v_store_global_v32bf16:
1338 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
1340 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1341 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
1342 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
1343 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
1344 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
1345 ; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
1346 ; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
1347 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
1348 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1349 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
1350 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
1351 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
1352 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
1353 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
1354 ; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
1355 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1356 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1357 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1358 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1359 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1360 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
1361 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
1362 ; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
1363 ; GCN-NEXT: s_mov_b32 s6, 0
1364 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1365 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1366 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
1367 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1
1368 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0
1369 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
1370 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1371 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1372 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1373 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
1374 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1375 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1376 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
1377 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
1378 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
1379 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
1380 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
1381 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
1382 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
1383 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
1384 ; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
1385 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
1386 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
1387 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
1388 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
1389 ; GCN-NEXT: s_mov_b32 s4, s6
1390 ; GCN-NEXT: s_mov_b32 s5, s6
1391 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
1392 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
1393 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
1394 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
1395 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
1396 ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
1397 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
1398 ; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
1399 ; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
1400 ; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
1401 ; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
1402 ; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
1403 ; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
1404 ; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
1405 ; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
1406 ; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
1407 ; GCN-NEXT: s_waitcnt vmcnt(1)
1408 ; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
1409 ; GCN-NEXT: s_waitcnt vmcnt(1)
1410 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
1411 ; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
1412 ; GCN-NEXT: s_waitcnt expcnt(0)
1413 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
1414 ; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
1415 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
1416 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1417 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1418 ; GCN-NEXT: s_setpc_b64 s[30:31]
1420 ; GFX7-LABEL: v_store_global_v32bf16:
1422 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1423 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1424 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1425 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1426 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1427 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1428 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1429 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
1430 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
1431 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1432 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
1433 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
1434 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1435 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1436 ; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
1437 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
1438 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1439 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1440 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1441 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1442 ; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
1443 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
1444 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
1445 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
1446 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1447 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
1448 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
1449 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
1450 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1451 ; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
1452 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
1453 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
1454 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1455 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
1456 ; GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16
1457 ; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
1458 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
1459 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
1460 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30
1461 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
1462 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
1463 ; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
1464 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
1465 ; GFX7-NEXT: s_mov_b32 s6, 0
1466 ; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
1467 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1468 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1469 ; GFX7-NEXT: s_mov_b32 s4, s6
1470 ; GFX7-NEXT: s_mov_b32 s5, s6
1471 ; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
1472 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1473 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14
1474 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1475 ; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16
1476 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
1477 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
1478 ; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
1479 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
1480 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1481 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
1482 ; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
1483 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
1484 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
1485 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1486 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
1487 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1488 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
1489 ; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
1490 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
1491 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
1492 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1493 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
1494 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
1495 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1496 ; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
1497 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
1498 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
1499 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1500 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1501 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1503 ; GFX8-LABEL: v_store_global_v32bf16:
1505 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1506 ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1507 ; GFX8-NEXT: s_nop 0
1508 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
1509 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1510 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
1511 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
1512 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1513 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1514 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
1515 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1516 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1517 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1518 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1520 ; GFX9-LABEL: v_store_global_v32bf16:
1522 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1523 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
1524 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
1525 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
1526 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
1527 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1528 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1530 ; GFX10-LABEL: v_store_global_v32bf16:
1532 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1533 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
1534 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
1535 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
1536 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
1537 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1539 ; GFX11-LABEL: v_store_global_v32bf16:
1541 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1542 ; GFX11-NEXT: s_clause 0x3
1543 ; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
1544 ; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
1545 ; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
1546 ; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
1547 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1548 store <32 x bfloat> %val, ptr addrspace(1) %ptr
; Stores a <64 x bfloat> function argument to a global (addrspace(1)) pointer.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected shape per check prefix (the prefix-to-RUN-line mapping is at the top
; of the file):
;  - GCN and GFX7 expect v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 sequences
;    that combine pairs of inputs into dwords ahead of eight buffer_store_dwordx4.
;  - GFX8, GFX9, GFX10 and GFX11 expect the incoming registers to be written out
;    directly with eight 128-bit stores.
;  - Every prefix expects loads relative to s32 for the destination pointer and
;    for part of the value (presumably stack-passed arguments; confirm against
;    the calling convention).
1552 define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
1553 ; GCN-LABEL: v_store_global_v64bf16:
1555 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1556 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
1557 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1558 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
1559 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
1560 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
1561 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
1562 ; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
1563 ; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
1564 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
1565 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1566 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
1567 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
1568 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
1569 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
1570 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
1571 ; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
1572 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
1573 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1574 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1575 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1576 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
1577 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
1578 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
1579 ; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
1580 ; GCN-NEXT: s_mov_b32 s6, 0
1581 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1582 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
1583 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1584 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1585 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1586 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1587 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1588 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1589 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1590 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1591 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
1592 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1593 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1594 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
1595 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
1596 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
1597 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
1598 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
1599 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
1600 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1601 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1602 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
1603 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
1604 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
1605 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
1606 ; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
1607 ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
1608 ; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
1609 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
1610 ; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
1611 ; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
1612 ; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
1613 ; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
1614 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
1615 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
1616 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
1617 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124
1618 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
1619 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
1620 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112
1621 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
1622 ; GCN-NEXT: s_mov_b32 s4, s6
1623 ; GCN-NEXT: s_mov_b32 s5, s6
1624 ; GCN-NEXT: s_waitcnt vmcnt(6)
1625 ; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
1626 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
1627 ; GCN-NEXT: s_waitcnt expcnt(0)
1628 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
1629 ; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
1630 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
1631 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92
1632 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
1633 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84
1634 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80
1635 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
1636 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
1637 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
1638 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
1639 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
1640 ; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
1641 ; GCN-NEXT: s_waitcnt vmcnt(14)
1642 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1643 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1644 ; GCN-NEXT: s_waitcnt vmcnt(13)
1645 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
1646 ; GCN-NEXT: s_waitcnt vmcnt(12)
1647 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
1648 ; GCN-NEXT: s_waitcnt vmcnt(11)
1649 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
1650 ; GCN-NEXT: s_waitcnt vmcnt(10)
1651 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1652 ; GCN-NEXT: s_waitcnt vmcnt(7)
1653 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1654 ; GCN-NEXT: s_waitcnt vmcnt(6)
1655 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
1656 ; GCN-NEXT: s_waitcnt vmcnt(5)
1657 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
1658 ; GCN-NEXT: s_waitcnt vmcnt(4)
1659 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
1660 ; GCN-NEXT: s_waitcnt vmcnt(3)
1661 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
1662 ; GCN-NEXT: s_waitcnt vmcnt(2)
1663 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
1664 ; GCN-NEXT: s_waitcnt vmcnt(1)
1665 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
1666 ; GCN-NEXT: s_waitcnt vmcnt(0)
1667 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
1668 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1669 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
1670 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
1671 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
1672 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
1673 ; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
1674 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
1675 ; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
1676 ; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
1677 ; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
1678 ; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
1679 ; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
1680 ; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
1681 ; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
1682 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
1683 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
1684 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
1685 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
1686 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
1687 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
1688 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
1689 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
1690 ; GCN-NEXT: s_waitcnt vmcnt(7)
1691 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1692 ; GCN-NEXT: s_waitcnt vmcnt(6)
1693 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1694 ; GCN-NEXT: s_waitcnt vmcnt(5)
1695 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1696 ; GCN-NEXT: s_waitcnt vmcnt(4)
1697 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
1698 ; GCN-NEXT: s_waitcnt vmcnt(3)
1699 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
1700 ; GCN-NEXT: s_waitcnt vmcnt(2)
1701 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
1702 ; GCN-NEXT: s_waitcnt vmcnt(1)
1703 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1704 ; GCN-NEXT: s_waitcnt vmcnt(0)
1705 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
1706 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1707 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
1708 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
1709 ; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
1710 ; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
1711 ; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
1712 ; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
1713 ; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
1714 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
1715 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
1716 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
1717 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
1718 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
1719 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
1720 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
1721 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
1722 ; GCN-NEXT: s_waitcnt vmcnt(7)
1723 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1724 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
1725 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
1726 ; GCN-NEXT: s_waitcnt vmcnt(6)
1727 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
1728 ; GCN-NEXT: s_waitcnt vmcnt(5)
1729 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
1730 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
1731 ; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
1732 ; GCN-NEXT: s_waitcnt vmcnt(4)
1733 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
1734 ; GCN-NEXT: s_waitcnt vmcnt(3)
1735 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
1736 ; GCN-NEXT: s_waitcnt vmcnt(2)
1737 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
1738 ; GCN-NEXT: s_waitcnt vmcnt(1)
1739 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
1740 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
1741 ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
1742 ; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
1743 ; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
1744 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
1745 ; GCN-NEXT: s_waitcnt vmcnt(1)
1746 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
1747 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
1748 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
1749 ; GCN-NEXT: s_waitcnt vmcnt(2)
1750 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1751 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
1752 ; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
1753 ; GCN-NEXT: s_waitcnt vmcnt(1)
1754 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
1755 ; GCN-NEXT: s_waitcnt vmcnt(0)
1756 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
1757 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
1758 ; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
1759 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
1760 ; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
1761 ; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
1762 ; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
1763 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
1764 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1765 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1766 ; GCN-NEXT: s_setpc_b64 s[30:31]
1768 ; GFX7-LABEL: v_store_global_v64bf16:
1770 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1771 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
1772 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
1773 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
1774 ; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116
1775 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112
1776 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108
1777 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
1778 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
1779 ; GFX7-NEXT: s_mov_b32 s6, 0
1780 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1781 ; GFX7-NEXT: s_mov_b32 s4, s6
1782 ; GFX7-NEXT: s_mov_b32 s5, s6
1783 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1784 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1785 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1786 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1787 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1788 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1789 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1790 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
1791 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
1792 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1793 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1794 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1795 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1796 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1797 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1798 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
1799 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1800 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
1801 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1802 ; GFX7-NEXT: s_waitcnt vmcnt(7)
1803 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
1804 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1805 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
1806 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1807 ; GFX7-NEXT: s_waitcnt vmcnt(5)
1808 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
1809 ; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16
1810 ; GFX7-NEXT: s_waitcnt vmcnt(3)
1811 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37
1812 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34
1813 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1814 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1815 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38
1816 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1817 ; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16
1818 ; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16
1819 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1820 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39
1821 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1822 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48
1823 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1824 ; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16
1825 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
1826 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
1827 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
1828 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92
1829 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
1830 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84
1831 ; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80
1832 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
1833 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1834 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
1835 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1836 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
1837 ; GFX7-NEXT: s_waitcnt vmcnt(5)
1838 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
1839 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1840 ; GFX7-NEXT: s_waitcnt vmcnt(4)
1841 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
1842 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
1843 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1844 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
1845 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
1846 ; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
1847 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1848 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
1849 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1850 ; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
1851 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
1852 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
1853 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
1854 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
1855 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
1856 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
1857 ; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
1858 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48
1859 ; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
1860 ; GFX7-NEXT: s_waitcnt vmcnt(7)
1861 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
1862 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1863 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1864 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
1865 ; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
1866 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
1867 ; GFX7-NEXT: s_waitcnt vmcnt(3)
1868 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
1869 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
1870 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1871 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
1872 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
1873 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
1874 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1875 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
1876 ; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
1877 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1878 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1879 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
1880 ; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
1881 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
1882 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
1883 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
1884 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
1885 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
1886 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
1887 ; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
1888 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
1889 ; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
1890 ; GFX7-NEXT: s_waitcnt vmcnt(7)
1891 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
1892 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1893 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1894 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
1895 ; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
1896 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
1897 ; GFX7-NEXT: s_waitcnt vmcnt(3)
1898 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
1899 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
1900 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1901 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
1902 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
1903 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
1904 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1905 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
1906 ; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
1907 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1908 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1909 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
1910 ; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
1911 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
1912 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
1913 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
1914 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
1915 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1916 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
1917 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1918 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1919 ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
1920 ; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
1921 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
1922 ; GFX7-NEXT: s_nop 0
1923 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
1924 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
1925 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
1926 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1927 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
1928 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1929 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
1930 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
1931 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
1932 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1933 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
1934 ; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
1935 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
1936 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1937 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
1938 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1939 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
1940 ; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
1941 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
1942 ; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
1943 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
1944 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1945 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
1946 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1947 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
1948 ; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
1949 ; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
1950 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
1951 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1952 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
1953 ; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
1954 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1955 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
1956 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1957 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
1958 ; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
1959 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
1960 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1961 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
1962 ; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
1963 ; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
1964 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
1965 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1966 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
1967 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
1968 ; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
1969 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
1970 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
1971 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
1972 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64
1973 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1974 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1976 ; GFX8-LABEL: v_store_global_v64bf16:
1978 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1979 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
1980 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
1981 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
1982 ; GFX8-NEXT: s_movk_i32 s4, 0x70
1983 ; GFX8-NEXT: s_movk_i32 s5, 0x50
1984 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1985 ; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32
1986 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1987 ; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
1988 ; GFX8-NEXT: s_movk_i32 s4, 0x60
1989 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1990 ; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31]
1991 ; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
1992 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32
1993 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
1994 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32
1995 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
1996 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32
1997 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc
1998 ; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
1999 ; GFX8-NEXT: s_nop 0
2000 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32
2001 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc
2002 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32
2003 ; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc
2004 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32
2005 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
2006 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
2007 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
2008 ; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
2009 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
2010 ; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
2011 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2012 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2014 ; GFX9-LABEL: v_store_global_v64bf16:
2016 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2017 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
2018 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
2019 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
2020 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2021 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
2022 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
2023 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
2024 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
2025 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
2026 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
2027 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
2028 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
2029 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2030 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2032 ; GFX10-LABEL: v_store_global_v64bf16:
2034 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2035 ; GFX10-NEXT: s_clause 0x2
2036 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
2037 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
2038 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
2039 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2040 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
2041 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
2042 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
2043 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
2044 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
2045 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
2046 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
2047 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
2048 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2050 ; GFX11-LABEL: v_store_global_v64bf16:
2052 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053 ; GFX11-NEXT: s_clause 0x2
2054 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
2055 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
2056 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
2057 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2058 ; GFX11-NEXT: s_clause 0x7
2059 ; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
2060 ; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
2061 ; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
2062 ; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
2063 ; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
2064 ; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
2065 ; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
2066 ; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
2067 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2068 store <64 x bfloat> %val, ptr addrspace(1) %ptr
; Stores two bfloat immediates (1.0 and 42.0) through two separate global
; (addrspace(1)) pointers.
; Every check prefix expects the constants to be materialized with v_mov_b32 as
; 0x3f80 and 0x4228, i.e. the upper 16 bits of the f32 encodings 0x3f800000
; (1.0) and 0x42280000 (42.0), and then written with a 16-bit store.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2072 define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
2073 ; GCN-LABEL: test_store_fpimm:
2075 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2076 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2077 ; GCN-NEXT: s_mov_b32 s6, 0
2078 ; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
2079 ; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
2080 ; GCN-NEXT: s_mov_b32 s4, s6
2081 ; GCN-NEXT: s_mov_b32 s5, s6
2082 ; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2083 ; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
2084 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2085 ; GCN-NEXT: s_setpc_b64 s[30:31]
2087 ; GFX7-LABEL: test_store_fpimm:
2089 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2090 ; GFX7-NEXT: s_mov_b32 s6, 0
2091 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2092 ; GFX7-NEXT: s_mov_b32 s4, s6
2093 ; GFX7-NEXT: s_mov_b32 s5, s6
2094 ; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
2095 ; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2096 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
2097 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2098 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2099 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2101 ; GFX8-LABEL: test_store_fpimm:
2103 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
2105 ; GFX8-NEXT: flat_store_short v[0:1], v4
2106 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
2107 ; GFX8-NEXT: flat_store_short v[2:3], v0
2108 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2109 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2111 ; GFX9-LABEL: test_store_fpimm:
2113 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2114 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
2115 ; GFX9-NEXT: global_store_short v[0:1], v4, off
2116 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
2117 ; GFX9-NEXT: global_store_short v[2:3], v0, off
2118 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2119 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2121 ; GFX10-LABEL: test_store_fpimm:
2123 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2124 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
2125 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
2126 ; GFX10-NEXT: global_store_short v[0:1], v4, off
2127 ; GFX10-NEXT: global_store_short v[2:3], v5, off
2128 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2130 ; GFX11-LABEL: test_store_fpimm:
2132 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133 ; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
2134 ; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
2135 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off
2136 ; GFX11-NEXT: global_store_b16 v[2:3], v5, off
2137 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2138 store bfloat 1.0, ptr addrspace(1) %ptr0
2139 store bfloat 42.0, ptr addrspace(1) %ptr1
; Loads a float from %in, truncates it to bfloat with fptrunc, and stores the
; 16-bit result to %out (both pointers are addrspace(1)).
; Two different expected lowerings are pinned here:
;  - GCN and GFX7 expect v_mul_f32 by 1.0 followed by a 16-bit right shift, then
;    a 16-bit store of the shifted value.
;  - GFX8, GFX9, GFX10 and GFX11 expect a v_bfe_u32 (bit 16) / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence before the high half is
;    stored. NOTE(review): this looks like round-to-nearest-even with NaN
;    quieting; confirm against the backend's FP_ROUND lowering.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2143 define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2144 ; GCN-LABEL: test_load_store_f32_to_bf16:
2146 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2147 ; GCN-NEXT: s_mov_b32 s6, 0
2148 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2149 ; GCN-NEXT: s_mov_b32 s4, s6
2150 ; GCN-NEXT: s_mov_b32 s5, s6
2151 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2152 ; GCN-NEXT: s_waitcnt vmcnt(0)
2153 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2154 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2155 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2156 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2157 ; GCN-NEXT: s_setpc_b64 s[30:31]
2159 ; GFX7-LABEL: test_load_store_f32_to_bf16:
2161 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2162 ; GFX7-NEXT: s_mov_b32 s6, 0
2163 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2164 ; GFX7-NEXT: s_mov_b32 s4, s6
2165 ; GFX7-NEXT: s_mov_b32 s5, s6
2166 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2167 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2168 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2169 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2170 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2171 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2172 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2174 ; GFX8-LABEL: test_load_store_f32_to_bf16:
2176 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2177 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2178 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2179 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
2180 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
2181 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
2182 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
2183 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
2184 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
2185 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2186 ; GFX8-NEXT: flat_store_short v[2:3], v0
2187 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2188 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2190 ; GFX9-LABEL: test_load_store_f32_to_bf16:
2192 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2193 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2194 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2195 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2196 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
2197 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
2198 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
2199 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
2200 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
2201 ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
2202 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2203 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2205 ; GFX10-LABEL: test_load_store_f32_to_bf16:
2207 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2208 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2209 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2210 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
2211 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
2212 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
2213 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
2214 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2215 ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
2216 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2218 ; GFX11-LABEL: test_load_store_f32_to_bf16:
2220 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2222 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2223 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
2224 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
2225 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
2226 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2227 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
2228 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2229 ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
2230 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2231 %val = load float, ptr addrspace(1) %in
2232 %val.bf16 = fptrunc float %val to bfloat
2233 store bfloat %val.bf16, ptr addrspace(1) %out
; Loads a double from %in, truncates it to bfloat with fptrunc, and stores the
; bfloat to %out.
; The GCN and GFX7 checks expect a plain v_cvt_f32_f64 followed by a 16-bit
; right shift. The GFX8 and later checks expect a longer sequence: convert |x|
; to f32, convert that back to f64 and compare it with |x|, conditionally
; adjust the f32 bits by +1/-1, re-apply the sign bit, and then the
; 0x7fff add / 0x400000 or / NaN-compare select before the 16-bit store.
; NOTE(review): the +1/-1 adjustment looks like a round-to-odd intermediate
; step to avoid double rounding - confirm against the lowering code.
2237 define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2238 ; GCN-LABEL: test_load_store_f64_to_bf16:
2240 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2241 ; GCN-NEXT: s_mov_b32 s6, 0
2242 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2243 ; GCN-NEXT: s_mov_b32 s4, s6
2244 ; GCN-NEXT: s_mov_b32 s5, s6
2245 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2246 ; GCN-NEXT: s_waitcnt vmcnt(0)
2247 ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2248 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2249 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2250 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2251 ; GCN-NEXT: s_setpc_b64 s[30:31]
2253 ; GFX7-LABEL: test_load_store_f64_to_bf16:
2255 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2256 ; GFX7-NEXT: s_mov_b32 s6, 0
2257 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2258 ; GFX7-NEXT: s_mov_b32 s4, s6
2259 ; GFX7-NEXT: s_mov_b32 s5, s6
2260 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2261 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2262 ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2263 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2264 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2265 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2266 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2268 ; GFX8-LABEL: test_load_store_f64_to_bf16:
2270 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2271 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2272 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2273 ; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2274 ; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
2275 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2276 ; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
2277 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
2278 ; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2279 ; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2280 ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
2281 ; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
2282 ; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
2283 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2284 ; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
2285 ; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
2286 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
2287 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
2288 ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2289 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
2290 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
2291 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2292 ; GFX8-NEXT: flat_store_short v[2:3], v0
2293 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2294 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2296 ; GFX9-LABEL: test_load_store_f64_to_bf16:
2298 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2299 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2300 ; GFX9-NEXT: s_brev_b32 s8, 1
2301 ; GFX9-NEXT: s_movk_i32 s9, 0x7fff
2302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2303 ; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2304 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2305 ; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
2306 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
2307 ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2308 ; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2309 ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2310 ; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
2311 ; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
2312 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2313 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2314 ; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
2315 ; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
2316 ; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
2317 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
2318 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
2319 ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
2320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2321 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2323 ; GFX10-LABEL: test_load_store_f64_to_bf16:
2325 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2326 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2327 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2328 ; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2329 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2330 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
2331 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2332 ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
2333 ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2334 ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
2335 ; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
2336 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
2337 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2338 ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2339 ; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2340 ; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
2341 ; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
2342 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
2343 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2344 ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
2345 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2347 ; GFX11-LABEL: test_load_store_f64_to_bf16:
2349 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2350 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2351 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2352 ; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2353 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2354 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2355 ; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
2356 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2357 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2358 ; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
2359 ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
2360 ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
2361 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2362 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
2363 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
2364 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2365 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2366 ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2367 ; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2368 ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
2369 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2370 ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
2371 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
2372 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2373 ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
2374 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2375 %val = load double, ptr addrspace(1) %in
2376 %val.bf16 = fptrunc double %val to bfloat
2377 store bfloat %val.bf16, ptr addrspace(1) %out
; Loads a bfloat from %in, widens it to float with fpext, and stores the float
; to %out.
; Every check prefix expects the same shape: a 16-bit unsigned load, a left
; shift by 16 to place the bfloat bits in the upper half of the dword, and a
; 32-bit store. No conversion instruction is expected.
2381 define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2382 ; GCN-LABEL: test_load_store_bf16_to_f32:
2384 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2385 ; GCN-NEXT: s_mov_b32 s6, 0
2386 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2387 ; GCN-NEXT: s_mov_b32 s4, s6
2388 ; GCN-NEXT: s_mov_b32 s5, s6
2389 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2390 ; GCN-NEXT: s_waitcnt vmcnt(0)
2391 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2392 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2393 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2394 ; GCN-NEXT: s_setpc_b64 s[30:31]
2396 ; GFX7-LABEL: test_load_store_bf16_to_f32:
2398 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2399 ; GFX7-NEXT: s_mov_b32 s6, 0
2400 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2401 ; GFX7-NEXT: s_mov_b32 s4, s6
2402 ; GFX7-NEXT: s_mov_b32 s5, s6
2403 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2404 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2405 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2406 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2407 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2408 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2410 ; GFX8-LABEL: test_load_store_bf16_to_f32:
2412 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2413 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
2414 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2415 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2416 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2417 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2418 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2420 ; GFX9-LABEL: test_load_store_bf16_to_f32:
2422 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2423 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
2424 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2425 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2426 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2427 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2428 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2430 ; GFX10-LABEL: test_load_store_bf16_to_f32:
2432 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2433 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
2434 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2435 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2436 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2437 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2439 ; GFX11-LABEL: test_load_store_bf16_to_f32:
2441 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2442 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
2443 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2444 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2445 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
2446 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2447 %val = load bfloat, ptr addrspace(1) %in
2448 %val.f32 = fpext bfloat %val to float
2449 store float %val.f32, ptr addrspace(1) %out
; Loads a bfloat from %in, widens it to double with fpext, and stores the
; double to %out.
; Every check prefix expects a 16-bit unsigned load, a left shift by 16 (which
; yields the f32 bit pattern), a v_cvt_f64_f32 on that value, and a 64-bit
; store.
2453 define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2454 ; GCN-LABEL: test_load_store_bf16_to_f64:
2456 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2457 ; GCN-NEXT: s_mov_b32 s6, 0
2458 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2459 ; GCN-NEXT: s_mov_b32 s4, s6
2460 ; GCN-NEXT: s_mov_b32 s5, s6
2461 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2462 ; GCN-NEXT: s_waitcnt vmcnt(0)
2463 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2464 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2465 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2466 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2467 ; GCN-NEXT: s_setpc_b64 s[30:31]
2469 ; GFX7-LABEL: test_load_store_bf16_to_f64:
2471 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2472 ; GFX7-NEXT: s_mov_b32 s6, 0
2473 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2474 ; GFX7-NEXT: s_mov_b32 s4, s6
2475 ; GFX7-NEXT: s_mov_b32 s5, s6
2476 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2477 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2478 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2479 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2480 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2481 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2482 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2484 ; GFX8-LABEL: test_load_store_bf16_to_f64:
2486 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2487 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
2488 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2489 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2490 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2491 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2492 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2493 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2495 ; GFX9-LABEL: test_load_store_bf16_to_f64:
2497 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
2499 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2500 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2501 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2502 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2503 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2504 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2506 ; GFX10-LABEL: test_load_store_bf16_to_f64:
2508 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2509 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
2510 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2511 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2512 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2513 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2514 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2516 ; GFX11-LABEL: test_load_store_bf16_to_f64:
2518 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2519 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
2520 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2521 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2522 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2523 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2524 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2525 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2526 %val = load bfloat, ptr addrspace(1) %in
2527 %val.f64 = fpext bfloat %val to double
2528 store double %val.f64, ptr addrspace(1) %out
; Copies a <2 x bfloat> value from %in to %out with no arithmetic in between.
; Every check prefix expects exactly one 32-bit load and one 32-bit store, with
; no per-element shifts or conversions.
2532 define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2533 ; GCN-LABEL: test_load_store_v2bf16:
2535 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2536 ; GCN-NEXT: s_mov_b32 s6, 0
2537 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2538 ; GCN-NEXT: s_mov_b32 s4, s6
2539 ; GCN-NEXT: s_mov_b32 s5, s6
2540 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2541 ; GCN-NEXT: s_waitcnt vmcnt(0)
2542 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2543 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2544 ; GCN-NEXT: s_setpc_b64 s[30:31]
2546 ; GFX7-LABEL: test_load_store_v2bf16:
2548 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2549 ; GFX7-NEXT: s_mov_b32 s6, 0
2550 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2551 ; GFX7-NEXT: s_mov_b32 s4, s6
2552 ; GFX7-NEXT: s_mov_b32 s5, s6
2553 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2554 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2555 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2556 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2559 ; GFX8-LABEL: test_load_store_v2bf16:
2561 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2563 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2564 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2565 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2566 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2568 ; GFX9-LABEL: test_load_store_v2bf16:
2570 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2571 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2572 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2573 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2574 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2575 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2577 ; GFX10-LABEL: test_load_store_v2bf16:
2579 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2581 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2582 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2583 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2585 ; GFX11-LABEL: test_load_store_v2bf16:
2587 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2588 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2589 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2590 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
2591 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2592 %val = load <2 x bfloat>, ptr addrspace(1) %in
2593 store <2 x bfloat> %val, ptr addrspace(1) %out
; Copies a <4 x bfloat> value from %in to %out with no arithmetic in between.
; Every check prefix expects exactly one 64-bit load and one 64-bit store, with
; no per-element shifts or conversions.
2597 define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2598 ; GCN-LABEL: test_load_store_v4bf16:
2600 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2601 ; GCN-NEXT: s_mov_b32 s6, 0
2602 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2603 ; GCN-NEXT: s_mov_b32 s4, s6
2604 ; GCN-NEXT: s_mov_b32 s5, s6
2605 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2606 ; GCN-NEXT: s_waitcnt vmcnt(0)
2607 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2608 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2609 ; GCN-NEXT: s_setpc_b64 s[30:31]
2611 ; GFX7-LABEL: test_load_store_v4bf16:
2613 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2614 ; GFX7-NEXT: s_mov_b32 s6, 0
2615 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2616 ; GFX7-NEXT: s_mov_b32 s4, s6
2617 ; GFX7-NEXT: s_mov_b32 s5, s6
2618 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2619 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2620 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2621 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2622 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2624 ; GFX8-LABEL: test_load_store_v4bf16:
2626 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2628 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2629 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2630 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2631 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2633 ; GFX9-LABEL: test_load_store_v4bf16:
2635 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2637 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2638 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2640 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2642 ; GFX10-LABEL: test_load_store_v4bf16:
2644 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2645 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2646 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2647 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2648 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2650 ; GFX11-LABEL: test_load_store_v4bf16:
2652 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2653 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2654 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2655 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2656 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2657 %val = load <4 x bfloat>, ptr addrspace(1) %in
2658 store <4 x bfloat> %val, ptr addrspace(1) %out
; Copies an <8 x bfloat> value from %in to %out with no arithmetic in between.
; Every check prefix expects exactly one 128-bit load into v[4:7] and one
; 128-bit store from the same registers.
2662 define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2663 ; GCN-LABEL: test_load_store_v8bf16:
2665 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2666 ; GCN-NEXT: s_mov_b32 s6, 0
2667 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2668 ; GCN-NEXT: s_mov_b32 s4, s6
2669 ; GCN-NEXT: s_mov_b32 s5, s6
2670 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2671 ; GCN-NEXT: s_waitcnt vmcnt(0)
2672 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2673 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2674 ; GCN-NEXT: s_setpc_b64 s[30:31]
2676 ; GFX7-LABEL: test_load_store_v8bf16:
2678 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679 ; GFX7-NEXT: s_mov_b32 s6, 0
2680 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2681 ; GFX7-NEXT: s_mov_b32 s4, s6
2682 ; GFX7-NEXT: s_mov_b32 s5, s6
2683 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2684 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2685 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2686 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2687 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2689 ; GFX8-LABEL: test_load_store_v8bf16:
2691 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2692 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2693 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2694 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
2695 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2696 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2698 ; GFX9-LABEL: test_load_store_v8bf16:
2700 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2701 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
2702 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2703 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
2704 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2705 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2707 ; GFX10-LABEL: test_load_store_v8bf16:
2709 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2710 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
2711 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2712 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
2713 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2715 ; GFX11-LABEL: test_load_store_v8bf16:
2717 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2718 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
2719 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2720 ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off
2721 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2722 %val = load <8 x bfloat>, ptr addrspace(1) %in
2723 store <8 x bfloat> %val, ptr addrspace(1) %out
; Copies a <16 x bfloat> value (32 bytes) from %in to %out with no arithmetic
; in between.
; Every check prefix expects the copy to be split into two 128-bit loads and
; two 128-bit stores, one pair at offset 0 and one at offset 16. Most prefixes
; encode the +16 as an instruction offset; the GFX8 checks instead expect the
; +16 addresses to be computed with v_add_u32 / v_addc_u32 before the flat
; accesses. The GFX10 and GFX11 checks also expect the two loads to be grouped
; under s_clause 0x1.
2727 define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2728 ; GCN-LABEL: test_load_store_v16bf16:
2730 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731 ; GCN-NEXT: s_mov_b32 s6, 0
2732 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2733 ; GCN-NEXT: s_mov_b32 s4, s6
2734 ; GCN-NEXT: s_mov_b32 s5, s6
2735 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2736 ; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2737 ; GCN-NEXT: s_waitcnt vmcnt(1)
2738 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2739 ; GCN-NEXT: s_waitcnt vmcnt(1)
2740 ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2741 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2742 ; GCN-NEXT: s_setpc_b64 s[30:31]
2744 ; GFX7-LABEL: test_load_store_v16bf16:
2746 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2747 ; GFX7-NEXT: s_mov_b32 s6, 0
2748 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2749 ; GFX7-NEXT: s_mov_b32 s4, s6
2750 ; GFX7-NEXT: s_mov_b32 s5, s6
2751 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2752 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2753 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2754 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2755 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2756 ; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2757 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2758 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2760 ; GFX8-LABEL: test_load_store_v16bf16:
2762 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2763 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0
2764 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
2765 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2766 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2767 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
2768 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
2769 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2770 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
2771 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2772 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
2773 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2774 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2776 ; GFX9-LABEL: test_load_store_v16bf16:
2778 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2779 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
2780 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
2781 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2782 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
2783 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2784 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
2785 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2786 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2788 ; GFX10-LABEL: test_load_store_v16bf16:
2790 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2791 ; GFX10-NEXT: s_clause 0x1
2792 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
2793 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
2794 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2795 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
2796 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2797 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
2798 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2800 ; GFX11-LABEL: test_load_store_v16bf16:
2802 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803 ; GFX11-NEXT: s_clause 0x1
2804 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
2805 ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off
2806 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2807 ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
2808 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2809 ; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off
2810 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2811 %val = load <16 x bfloat>, ptr addrspace(1) %in
2812 store <16 x bfloat> %val, ptr addrspace(1) %out
; Stores a scalar bfloat function argument %in to %out.
; The GCN and GFX7 checks expect the argument in v0 to be multiplied by 1.0 and
; shifted right by 16 before a 16-bit store. The GFX8 and later checks expect
; v0 to be stored directly with a 16-bit store and no preceding ALU work.
; NOTE(review): the multiply/shift presumably reflects the argument being
; passed as an f32-width value on the older targets - confirm against the
; calling-convention lowering.
2816 define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
2817 ; GCN-LABEL: test_arg_store:
2819 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2820 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2821 ; GCN-NEXT: s_mov_b32 s6, 0
2822 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2823 ; GCN-NEXT: s_mov_b32 s4, s6
2824 ; GCN-NEXT: s_mov_b32 s5, s6
2825 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2826 ; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2827 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2828 ; GCN-NEXT: s_setpc_b64 s[30:31]
2830 ; GFX7-LABEL: test_arg_store:
2832 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2833 ; GFX7-NEXT: s_mov_b32 s6, 0
2834 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2835 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2836 ; GFX7-NEXT: s_mov_b32 s4, s6
2837 ; GFX7-NEXT: s_mov_b32 s5, s6
2838 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2839 ; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2840 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2841 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2843 ; GFX8-LABEL: test_arg_store:
2845 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2846 ; GFX8-NEXT: flat_store_short v[1:2], v0
2847 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2848 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2850 ; GFX9-LABEL: test_arg_store:
2852 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2853 ; GFX9-NEXT: global_store_short v[1:2], v0, off
2854 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2855 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2857 ; GFX10-LABEL: test_arg_store:
2859 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2860 ; GFX10-NEXT: global_store_short v[1:2], v0, off
2861 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2863 ; GFX11-LABEL: test_arg_store:
2865 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866 ; GFX11-NEXT: global_store_b16 v[1:2], v0, off
2867 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2868 store bfloat %in, ptr addrspace(1) %out
; Stores a <2 x bfloat> function argument %in to %out.
; The GCN and GFX7 checks expect the two elements in v0 and v1 with the pointer
; in v[2:3]: each element is multiplied by 1.0, then the pair is packed into
; one dword with a 16-bit right shift and v_alignbit_b32 before a 32-bit store.
; The GFX8 and later checks expect the already-packed value in v0 with the
; pointer in v[1:2], stored directly with a 32-bit store.
2872 define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
2873 ; GCN-LABEL: test_arg_store_v2bf16:
2875 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2876 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
2877 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2878 ; GCN-NEXT: s_mov_b32 s6, 0
2879 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2880 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
2881 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2882 ; GCN-NEXT: s_mov_b32 s4, s6
2883 ; GCN-NEXT: s_mov_b32 s5, s6
2884 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2885 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2886 ; GCN-NEXT: s_setpc_b64 s[30:31]
2888 ; GFX7-LABEL: test_arg_store_v2bf16:
2890 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
2892 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2893 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2894 ; GFX7-NEXT: s_mov_b32 s6, 0
2895 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
2896 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2897 ; GFX7-NEXT: s_mov_b32 s4, s6
2898 ; GFX7-NEXT: s_mov_b32 s5, s6
2899 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2900 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2901 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2903 ; GFX8-LABEL: test_arg_store_v2bf16:
2905 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2906 ; GFX8-NEXT: flat_store_dword v[1:2], v0
2907 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2908 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2910 ; GFX9-LABEL: test_arg_store_v2bf16:
2912 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913 ; GFX9-NEXT: global_store_dword v[1:2], v0, off
2914 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2915 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2917 ; GFX10-LABEL: test_arg_store_v2bf16:
2919 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920 ; GFX10-NEXT: global_store_dword v[1:2], v0, off
2921 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2923 ; GFX11-LABEL: test_arg_store_v2bf16:
2925 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2926 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off
2927 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2928 store <2 x bfloat> %in, ptr addrspace(1) %out
; Stores a <3 x bfloat> function argument %in to %out.
; Every check prefix expects the 6-byte value to be written as one 32-bit store
; at offset 0 plus one 16-bit store at offset 4.
; The GCN and GFX7 checks expect three separate elements in v0..v2 that are
; multiplied by 1.0, shifted, and (for the first two) packed with
; v_alignbit_b32. The GFX8 and later checks expect the packed value in v0/v1
; and no ALU work on the data; the GFX8 checks additionally expect the +4
; address to be computed with v_add_u32 / v_addc_u32 rather than an offset
; operand.
2932 define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
2933 ; GCN-LABEL: test_arg_store_v3bf16:
2935 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
2937 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2938 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2939 ; GCN-NEXT: s_mov_b32 s6, 0
2940 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
2941 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2942 ; GCN-NEXT: s_mov_b32 s4, s6
2943 ; GCN-NEXT: s_mov_b32 s5, s6
2944 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2945 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
2946 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
2947 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2948 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2949 ; GCN-NEXT: s_setpc_b64 s[30:31]
2951 ; GFX7-LABEL: test_arg_store_v3bf16:
2953 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2954 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
2955 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2956 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2957 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
2958 ; GFX7-NEXT: s_mov_b32 s6, 0
2959 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
2960 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2961 ; GFX7-NEXT: s_mov_b32 s4, s6
2962 ; GFX7-NEXT: s_mov_b32 s5, s6
2963 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2964 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
2965 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2966 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2967 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2969 ; GFX8-LABEL: test_arg_store_v3bf16:
2971 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2972 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2973 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
2974 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2975 ; GFX8-NEXT: flat_store_short v[2:3], v1
2976 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2977 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2979 ; GFX9-LABEL: test_arg_store_v3bf16:
2981 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2982 ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
2983 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2984 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2985 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2987 ; GFX10-LABEL: test_arg_store_v3bf16:
2989 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990 ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
2991 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2992 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2994 ; GFX11-LABEL: test_arg_store_v3bf16:
2996 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997 ; GFX11-NEXT: s_clause 0x1
2998 ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
2999 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
3000 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3001 store <3 x bfloat> %in, ptr addrspace(1) %out
; Stores a <4 x bfloat> function argument %in to %out.
; The GCN and GFX7 checks expect four separate elements in v0..v3 with the
; pointer in v[4:5]: each element is multiplied by 1.0, then adjacent pairs are
; packed into dwords with a 16-bit right shift and v_alignbit_b32 before a
; single 64-bit store. The GFX8 and later checks expect the already-packed
; value in v[0:1] with the pointer in v[2:3], stored directly with a 64-bit
; store.
3005 define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
3006 ; GCN-LABEL: test_arg_store_v4bf16:
3008 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3009 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3010 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
3011 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3012 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3013 ; GCN-NEXT: s_mov_b32 s6, 0
3014 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3015 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
3016 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
3017 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
3018 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3019 ; GCN-NEXT: s_mov_b32 s4, s6
3020 ; GCN-NEXT: s_mov_b32 s5, s6
3021 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
3022 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3023 ; GCN-NEXT: s_setpc_b64 s[30:31]
3025 ; GFX7-LABEL: test_arg_store_v4bf16:
3027 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3028 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3029 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3030 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3031 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3032 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3033 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3034 ; GFX7-NEXT: s_mov_b32 s6, 0
3035 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
3036 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
3037 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3038 ; GFX7-NEXT: s_mov_b32 s4, s6
3039 ; GFX7-NEXT: s_mov_b32 s5, s6
3040 ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
3041 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3042 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3044 ; GFX8-LABEL: test_arg_store_v4bf16:
3046 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3047 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
3048 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3049 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3051 ; GFX9-LABEL: test_arg_store_v4bf16:
3053 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3054 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
3055 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3056 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3058 ; GFX10-LABEL: test_arg_store_v4bf16:
3060 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3061 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
3062 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3064 ; GFX11-LABEL: test_arg_store_v4bf16:
3066 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3067 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
3068 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3069 store <4 x bfloat> %in, ptr addrspace(1) %out
; test_arg_store_v8bf16: store the <8 x bfloat> argument %in through the
; addrspace(1) pointer %out.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape:
;   GCN/GFX7:   v0-v7 are each multiplied by 1.0, the odd-numbered inputs are
;               shifted right by 16, and four v_alignbit_b32 pack the pairs
;               into four dwords for a single buffer_store_dwordx4.
;   GFX8-GFX11: v[0:3] is stored as-is with one 128-bit store to v[4:5].
3073 define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
3074 ; GCN-LABEL: test_arg_store_v8bf16:
3076 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3077 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3078 ; GCN-NEXT: s_mov_b32 s6, 0
3079 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
3080 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
3081 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
3082 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
3083 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3084 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
3085 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3086 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3087 ; GCN-NEXT: s_mov_b32 s4, s6
3088 ; GCN-NEXT: s_mov_b32 s5, s6
3089 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
3090 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3091 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
3092 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
3093 ; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
3094 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
3095 ; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
3096 ; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
3097 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
3098 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3099 ; GCN-NEXT: s_setpc_b64 s[30:31]
3101 ; GFX7-LABEL: test_arg_store_v8bf16:
3103 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3104 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
3105 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
3106 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3107 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3108 ; GFX7-NEXT: s_mov_b32 s6, 0
3109 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3110 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
3111 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3112 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
3113 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3114 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3115 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3116 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3117 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3118 ; GFX7-NEXT: s_mov_b32 s4, s6
3119 ; GFX7-NEXT: s_mov_b32 s5, s6
3120 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
3121 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
3122 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
3123 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
3124 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
3125 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3126 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3128 ; GFX8-LABEL: test_arg_store_v8bf16:
3130 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3131 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3132 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3133 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3135 ; GFX9-LABEL: test_arg_store_v8bf16:
3137 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3138 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3139 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3140 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3142 ; GFX10-LABEL: test_arg_store_v8bf16:
3144 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3145 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3146 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3148 ; GFX11-LABEL: test_arg_store_v8bf16:
3150 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3151 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
3152 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one whole-vector store of the argument.
3153 store <8 x bfloat> %in, ptr addrspace(1) %out
; test_arg_store_v16bf16: store the <16 x bfloat> argument %in through the
; addrspace(1) pointer %out.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape:
;   GCN/GFX7:   v0-v15 are each multiplied by 1.0, the odd-numbered inputs are
;               shifted right by 16, and eight v_alignbit_b32 pack the pairs;
;               two buffer_store_dwordx4 follow (offset:16 first, then 0).
;   GFX8:       two flat_store_dwordx4; the second address is v[8:9] + 16,
;               formed with v_add_u32/v_addc_u32 after an s_nop 0.
;   GFX9-GFX11: two 128-bit global stores from v[4:7] (offset:16) and v[0:3];
;               GFX11 additionally expects a leading s_clause 0x1.
3157 define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
3158 ; GCN-LABEL: test_arg_store_v16bf16:
3160 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3161 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
3162 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
3163 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
3164 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
3165 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3166 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
3167 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3168 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3169 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3170 ; GCN-NEXT: s_mov_b32 s6, 0
3171 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
3172 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
3173 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
3174 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
3175 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
3176 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
3177 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
3178 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
3179 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3180 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3181 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
3182 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
3183 ; GCN-NEXT: s_mov_b32 s4, s6
3184 ; GCN-NEXT: s_mov_b32 s5, s6
3185 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
3186 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
3187 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
3188 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
3189 ; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
3190 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
3191 ; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
3192 ; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
3193 ; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
3194 ; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
3195 ; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
3196 ; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
3197 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
3198 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
3199 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3200 ; GCN-NEXT: s_setpc_b64 s[30:31]
3202 ; GFX7-LABEL: test_arg_store_v16bf16:
3204 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3205 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
3206 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3207 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3208 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3209 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
3210 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3211 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3212 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3213 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3214 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
3215 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
3216 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
3217 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
3218 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3219 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
3220 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
3221 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
3222 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3223 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
3224 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
3225 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
3226 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3227 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
3228 ; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
3229 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
3230 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
3231 ; GFX7-NEXT: s_mov_b32 s6, 0
3232 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3233 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
3234 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3235 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
3236 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3237 ; GFX7-NEXT: s_mov_b32 s4, s6
3238 ; GFX7-NEXT: s_mov_b32 s5, s6
3239 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
3240 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
3241 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
3242 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
3243 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3244 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3246 ; GFX8-LABEL: test_arg_store_v16bf16:
3248 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3249 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3250 ; GFX8-NEXT: s_nop 0
3251 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
3252 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
3253 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
3254 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3255 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3257 ; GFX9-LABEL: test_arg_store_v16bf16:
3259 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3260 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
3261 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3262 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3263 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3265 ; GFX10-LABEL: test_arg_store_v16bf16:
3267 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3268 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
3269 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3270 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3272 ; GFX11-LABEL: test_arg_store_v16bf16:
3274 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275 ; GFX11-NEXT: s_clause 0x1
3276 ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
3277 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
3278 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one whole-vector store of the argument.
3279 store <16 x bfloat> %in, ptr addrspace(1) %out
; test_inreg_arg_store: amdgpu_gfx function that stores its `inreg` bfloat
; argument %in through the addrspace(1) pointer %out.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: every target reads the argument from s4.
;   GCN/GFX7:   v_mul_f32_e64 of s4 by 1.0 into v2, shift right by 16, then a
;               buffer_store_short through the descriptor built in s[36:39].
;   GFX8-GFX11: v_mov_b32 of s4 into v2, then a 16-bit store to v[0:1].
3283 define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
3284 ; GCN-LABEL: test_inreg_arg_store:
3286 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3287 ; GCN-NEXT: s_mov_b32 s39, 0xf000
3288 ; GCN-NEXT: s_mov_b32 s38, 0
3289 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4
3290 ; GCN-NEXT: s_mov_b32 s36, s38
3291 ; GCN-NEXT: s_mov_b32 s37, s38
3292 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3293 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3294 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3295 ; GCN-NEXT: s_setpc_b64 s[30:31]
3297 ; GFX7-LABEL: test_inreg_arg_store:
3299 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3300 ; GFX7-NEXT: s_mov_b32 s38, 0
3301 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
3302 ; GFX7-NEXT: s_mov_b32 s39, 0xf000
3303 ; GFX7-NEXT: s_mov_b32 s36, s38
3304 ; GFX7-NEXT: s_mov_b32 s37, s38
3305 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3306 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3307 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3308 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3310 ; GFX8-LABEL: test_inreg_arg_store:
3312 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3313 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3314 ; GFX8-NEXT: flat_store_short v[0:1], v2
3315 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3316 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3318 ; GFX9-LABEL: test_inreg_arg_store:
3320 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3321 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
3322 ; GFX9-NEXT: global_store_short v[0:1], v2, off
3323 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3324 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3326 ; GFX10-LABEL: test_inreg_arg_store:
3328 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3329 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
3330 ; GFX10-NEXT: global_store_short v[0:1], v2, off
3331 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3333 ; GFX11-LABEL: test_inreg_arg_store:
3335 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3336 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
3337 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3338 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar bfloat store of the inreg argument.
3339 store bfloat %in, ptr addrspace(1) %out
; test_byval: write the bfloat argument %val into the byval(bfloat)
; addrspace(5) slot %bv, then load the slot back into %retval.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: a single 16-bit store addressed by s32 and no load
; instruction on any target.
;   GCN/GFX7:   v0 is multiplied by 1.0 into v1 and shifted right by 16 before
;               buffer_store_short.
;   GFX8-GFX10: buffer_store_short of v0 directly.
;   GFX11:      scratch_store_b16 of v0.
3343 define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
3344 ; GCN-LABEL: test_byval:
3346 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3347 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0
3348 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3349 ; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32
3350 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3351 ; GCN-NEXT: s_setpc_b64 s[30:31]
3353 ; GFX7-LABEL: test_byval:
3355 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3356 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0
3357 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3358 ; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32
3359 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3360 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3362 ; GFX8-LABEL: test_byval:
3364 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3365 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
3366 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3367 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3369 ; GFX9-LABEL: test_byval:
3371 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3372 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
3373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3374 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3376 ; GFX10-LABEL: test_byval:
3378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3379 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
3380 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3382 ; GFX11-LABEL: test_byval:
3384 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3385 ; GFX11-NEXT: scratch_store_b16 off, v0, s32
3386 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: store to the byval slot, then read the same slot back.
3387 store bfloat %val, ptr addrspace(5) %bv
3388 %retval = load bfloat, ptr addrspace(5) %bv
; test_sret: store the bfloat argument %val through the sret(bfloat)
; addrspace(5) pointer %sret.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: a single 16-bit store that uses v0 as the address operand
; and v1 as the data operand.
;   GCN/GFX7:   v1 is multiplied by 1.0 and shifted right by 16 before
;               buffer_store_short ... offen.
;   GFX8-GFX10: buffer_store_short ... offen of v1 directly.
;   GFX11:      scratch_store_b16 v0, v1, off.
3392 define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
3393 ; GCN-LABEL: test_sret:
3395 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3396 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3397 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3398 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3399 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3400 ; GCN-NEXT: s_setpc_b64 s[30:31]
3402 ; GFX7-LABEL: test_sret:
3404 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3405 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3406 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3407 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3408 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3409 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3411 ; GFX8-LABEL: test_sret:
3413 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3414 ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3415 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3416 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3418 ; GFX9-LABEL: test_sret:
3420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3421 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3422 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3425 ; GFX10-LABEL: test_sret:
3427 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3428 ; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3429 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3431 ; GFX11-LABEL: test_sret:
3433 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3434 ; GFX11-NEXT: scratch_store_b16 v0, v1, off
3435 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar bfloat store through the sret pointer.
3436 store bfloat %val, ptr addrspace(5) %sret
; test_bitcast_from_bfloat: load a bfloat from %in, bitcast it to i16, and
; store the i16 to %out (both pointers are addrspace(1)).
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target a 16-bit load from v[0:1], a wait on vmcnt,
; and a 16-bit store to v[2:3]; no arithmetic instruction is expected for the
; bitcast. GCN/GFX7 first build the buffer descriptor in s[4:7].
3440 define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
3441 ; GCN-LABEL: test_bitcast_from_bfloat:
3443 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444 ; GCN-NEXT: s_mov_b32 s6, 0
3445 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3446 ; GCN-NEXT: s_mov_b32 s4, s6
3447 ; GCN-NEXT: s_mov_b32 s5, s6
3448 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3449 ; GCN-NEXT: s_waitcnt vmcnt(0)
3450 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3451 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3452 ; GCN-NEXT: s_setpc_b64 s[30:31]
3454 ; GFX7-LABEL: test_bitcast_from_bfloat:
3456 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3457 ; GFX7-NEXT: s_mov_b32 s6, 0
3458 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3459 ; GFX7-NEXT: s_mov_b32 s4, s6
3460 ; GFX7-NEXT: s_mov_b32 s5, s6
3461 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3462 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3463 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3464 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3465 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3467 ; GFX8-LABEL: test_bitcast_from_bfloat:
3469 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3470 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
3471 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3472 ; GFX8-NEXT: flat_store_short v[2:3], v0
3473 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3474 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3476 ; GFX9-LABEL: test_bitcast_from_bfloat:
3478 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3479 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
3480 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3481 ; GFX9-NEXT: global_store_short v[2:3], v0, off
3482 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3483 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3485 ; GFX10-LABEL: test_bitcast_from_bfloat:
3487 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3488 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
3489 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3490 ; GFX10-NEXT: global_store_short v[2:3], v0, off
3491 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3493 ; GFX11-LABEL: test_bitcast_from_bfloat:
3495 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3496 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
3497 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3498 ; GFX11-NEXT: global_store_b16 v[2:3], v0, off
3499 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: load bfloat -> bitcast to i16 -> store i16.
3500 %val = load bfloat, ptr addrspace(1) %in
3501 %val_int = bitcast bfloat %val to i16
3502 store i16 %val_int, ptr addrspace(1) %out
; test_bitcast_to_bfloat: load an i16 from %in, bitcast it to bfloat, and
; store the bfloat to %out (both pointers are addrspace(1)).
; Note the parameter order is (%out, %in), so the expected load reads v[2:3]
; and the expected store writes v[0:1].
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target a 16-bit load, a wait on vmcnt, and a 16-bit
; store; no arithmetic instruction is expected for the bitcast. GCN/GFX7 first
; build the buffer descriptor in s[4:7].
3506 define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
3507 ; GCN-LABEL: test_bitcast_to_bfloat:
3509 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3510 ; GCN-NEXT: s_mov_b32 s6, 0
3511 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3512 ; GCN-NEXT: s_mov_b32 s4, s6
3513 ; GCN-NEXT: s_mov_b32 s5, s6
3514 ; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3515 ; GCN-NEXT: s_waitcnt vmcnt(0)
3516 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3517 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3518 ; GCN-NEXT: s_setpc_b64 s[30:31]
3520 ; GFX7-LABEL: test_bitcast_to_bfloat:
3522 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3523 ; GFX7-NEXT: s_mov_b32 s6, 0
3524 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3525 ; GFX7-NEXT: s_mov_b32 s4, s6
3526 ; GFX7-NEXT: s_mov_b32 s5, s6
3527 ; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3528 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3529 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3530 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3531 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3533 ; GFX8-LABEL: test_bitcast_to_bfloat:
3535 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3536 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
3537 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3538 ; GFX8-NEXT: flat_store_short v[0:1], v2
3539 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3540 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3542 ; GFX9-LABEL: test_bitcast_to_bfloat:
3544 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3545 ; GFX9-NEXT: global_load_ushort v2, v[2:3], off
3546 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3547 ; GFX9-NEXT: global_store_short v[0:1], v2, off
3548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3549 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3551 ; GFX10-LABEL: test_bitcast_to_bfloat:
3553 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3554 ; GFX10-NEXT: global_load_ushort v2, v[2:3], off
3555 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3556 ; GFX10-NEXT: global_store_short v[0:1], v2, off
3557 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3559 ; GFX11-LABEL: test_bitcast_to_bfloat:
3561 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3562 ; GFX11-NEXT: global_load_u16 v2, v[2:3], off
3563 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3564 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3565 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: load i16 -> bitcast to bfloat -> store bfloat.
3566 %val = load i16, ptr addrspace(1) %in
3567 %val_fp = bitcast i16 %val to bfloat
3568 store bfloat %val_fp, ptr addrspace(1) %out
; test_ret: function taking a bfloat %in and returning a bfloat.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3572 define bfloat @test_ret(bfloat %in) {
3573 ; GCN-LABEL: test_ret:
3574 ; GCN: ; %bb.0: ; %entry
3575 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3576 ; GCN-NEXT: s_setpc_b64 s[30:31]
3578 ; GFX7-LABEL: test_ret:
3579 ; GFX7: ; %bb.0: ; %entry
3580 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3581 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3583 ; GFX8-LABEL: test_ret:
3584 ; GFX8: ; %bb.0: ; %entry
3585 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3586 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3588 ; GFX9-LABEL: test_ret:
3589 ; GFX9: ; %bb.0: ; %entry
3590 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3591 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3593 ; GFX10-LABEL: test_ret:
3594 ; GFX10: ; %bb.0: ; %entry
3595 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3596 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3598 ; GFX11-LABEL: test_ret:
3599 ; GFX11: ; %bb.0: ; %entry
3600 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3601 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; test_ret_v2bf16: return the <2 x bfloat> argument %in unchanged.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3606 define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
3607 ; GCN-LABEL: test_ret_v2bf16:
3608 ; GCN: ; %bb.0: ; %entry
3609 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3610 ; GCN-NEXT: s_setpc_b64 s[30:31]
3612 ; GFX7-LABEL: test_ret_v2bf16:
3613 ; GFX7: ; %bb.0: ; %entry
3614 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3615 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3617 ; GFX8-LABEL: test_ret_v2bf16:
3618 ; GFX8: ; %bb.0: ; %entry
3619 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3620 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3622 ; GFX9-LABEL: test_ret_v2bf16:
3623 ; GFX9: ; %bb.0: ; %entry
3624 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3625 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3627 ; GFX10-LABEL: test_ret_v2bf16:
3628 ; GFX10: ; %bb.0: ; %entry
3629 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3630 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3632 ; GFX11-LABEL: test_ret_v2bf16:
3633 ; GFX11: ; %bb.0: ; %entry
3634 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3635 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: return the argument as-is.
3637 ret <2 x bfloat> %in
; test_ret_v3bf16: return the <3 x bfloat> argument %in unchanged.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3640 define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
3641 ; GCN-LABEL: test_ret_v3bf16:
3642 ; GCN: ; %bb.0: ; %entry
3643 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3644 ; GCN-NEXT: s_setpc_b64 s[30:31]
3646 ; GFX7-LABEL: test_ret_v3bf16:
3647 ; GFX7: ; %bb.0: ; %entry
3648 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3649 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3651 ; GFX8-LABEL: test_ret_v3bf16:
3652 ; GFX8: ; %bb.0: ; %entry
3653 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3656 ; GFX9-LABEL: test_ret_v3bf16:
3657 ; GFX9: ; %bb.0: ; %entry
3658 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3659 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3661 ; GFX10-LABEL: test_ret_v3bf16:
3662 ; GFX10: ; %bb.0: ; %entry
3663 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3664 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3666 ; GFX11-LABEL: test_ret_v3bf16:
3667 ; GFX11: ; %bb.0: ; %entry
3668 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3669 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: return the argument as-is.
3671 ret <3 x bfloat> %in
; test_ret_v4bf16: return the <4 x bfloat> argument %in unchanged.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3674 define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
3675 ; GCN-LABEL: test_ret_v4bf16:
3676 ; GCN: ; %bb.0: ; %entry
3677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3678 ; GCN-NEXT: s_setpc_b64 s[30:31]
3680 ; GFX7-LABEL: test_ret_v4bf16:
3681 ; GFX7: ; %bb.0: ; %entry
3682 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3683 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3685 ; GFX8-LABEL: test_ret_v4bf16:
3686 ; GFX8: ; %bb.0: ; %entry
3687 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3688 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3690 ; GFX9-LABEL: test_ret_v4bf16:
3691 ; GFX9: ; %bb.0: ; %entry
3692 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3693 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3695 ; GFX10-LABEL: test_ret_v4bf16:
3696 ; GFX10: ; %bb.0: ; %entry
3697 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3698 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3700 ; GFX11-LABEL: test_ret_v4bf16:
3701 ; GFX11: ; %bb.0: ; %entry
3702 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3703 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: return the argument as-is.
3705 ret <4 x bfloat> %in
; test_ret_v8bf16: return the <8 x bfloat> argument %in unchanged.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3708 define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
3709 ; GCN-LABEL: test_ret_v8bf16:
3710 ; GCN: ; %bb.0: ; %entry
3711 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3712 ; GCN-NEXT: s_setpc_b64 s[30:31]
3714 ; GFX7-LABEL: test_ret_v8bf16:
3715 ; GFX7: ; %bb.0: ; %entry
3716 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3717 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3719 ; GFX8-LABEL: test_ret_v8bf16:
3720 ; GFX8: ; %bb.0: ; %entry
3721 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3722 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3724 ; GFX9-LABEL: test_ret_v8bf16:
3725 ; GFX9: ; %bb.0: ; %entry
3726 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3727 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3729 ; GFX10-LABEL: test_ret_v8bf16:
3730 ; GFX10: ; %bb.0: ; %entry
3731 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3732 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3734 ; GFX11-LABEL: test_ret_v8bf16:
3735 ; GFX11: ; %bb.0: ; %entry
3736 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3737 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: return the argument as-is.
3739 ret <8 x bfloat> %in
; test_ret_v16bf16: return the <16 x bfloat> argument %in unchanged.
; The per-target check lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script rather than editing by hand.
; Expected shape: on every target the %entry block is only the entry s_waitcnt
; followed by s_setpc_b64, i.e. no instruction is expected in between.
3742 define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
3743 ; GCN-LABEL: test_ret_v16bf16:
3744 ; GCN: ; %bb.0: ; %entry
3745 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3746 ; GCN-NEXT: s_setpc_b64 s[30:31]
3748 ; GFX7-LABEL: test_ret_v16bf16:
3749 ; GFX7: ; %bb.0: ; %entry
3750 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3751 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3753 ; GFX8-LABEL: test_ret_v16bf16:
3754 ; GFX8: ; %bb.0: ; %entry
3755 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3756 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3758 ; GFX9-LABEL: test_ret_v16bf16:
3759 ; GFX9: ; %bb.0: ; %entry
3760 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3761 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3763 ; GFX10-LABEL: test_ret_v16bf16:
3764 ; GFX10: ; %bb.0: ; %entry
3765 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3766 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3768 ; GFX11-LABEL: test_ret_v16bf16:
3769 ; GFX11: ; %bb.0: ; %entry
3770 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3771 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: return the argument as-is.
3773 ret <16 x bfloat> %in
3776 define void @test_call(bfloat %in, ptr addrspace(5) %out) {
3777 ; GCN-LABEL: test_call:
3778 ; GCN: ; %bb.0: ; %entry
3779 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3780 ; GCN-NEXT: s_mov_b32 s18, s33
3781 ; GCN-NEXT: s_mov_b32 s33, s32
3782 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
3783 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3784 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
3785 ; GCN-NEXT: s_addk_i32 s32, 0x400
3786 ; GCN-NEXT: s_waitcnt expcnt(0)
3787 ; GCN-NEXT: v_writelane_b32 v2, s30, 0
3788 ; GCN-NEXT: v_writelane_b32 v2, s31, 1
3789 ; GCN-NEXT: s_getpc_b64 s[16:17]
3790 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3791 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3792 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3793 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
3794 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
3795 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3796 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3797 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3798 ; GCN-NEXT: s_waitcnt vmcnt(0)
3799 ; GCN-NEXT: v_readlane_b32 s31, v2, 1
3800 ; GCN-NEXT: v_readlane_b32 s30, v2, 0
3801 ; GCN-NEXT: s_mov_b32 s32, s33
3802 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
3803 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3804 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
3805 ; GCN-NEXT: s_mov_b32 s33, s18
3806 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3807 ; GCN-NEXT: s_setpc_b64 s[30:31]
3809 ; GFX7-LABEL: test_call:
3810 ; GFX7: ; %bb.0: ; %entry
3811 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3812 ; GFX7-NEXT: s_mov_b32 s18, s33
3813 ; GFX7-NEXT: s_mov_b32 s33, s32
3814 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
3815 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3816 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
3817 ; GFX7-NEXT: s_addk_i32 s32, 0x400
3818 ; GFX7-NEXT: s_getpc_b64 s[16:17]
3819 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3820 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3821 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3822 ; GFX7-NEXT: v_writelane_b32 v2, s30, 0
3823 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1
3824 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3825 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
3826 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3827 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3828 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3829 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3830 ; GFX7-NEXT: v_readlane_b32 s31, v2, 1
3831 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0
3832 ; GFX7-NEXT: s_mov_b32 s32, s33
3833 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
3834 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3835 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
3836 ; GFX7-NEXT: s_mov_b32 s33, s18
3837 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3838 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3840 ; GFX8-LABEL: test_call:
3841 ; GFX8: ; %bb.0: ; %entry
3842 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3843 ; GFX8-NEXT: s_mov_b32 s18, s33
3844 ; GFX8-NEXT: s_mov_b32 s33, s32
3845 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
3846 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3847 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
3848 ; GFX8-NEXT: s_addk_i32 s32, 0x400
3849 ; GFX8-NEXT: s_getpc_b64 s[16:17]
3850 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3851 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3852 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3853 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0
3854 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1
3855 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3856 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
3857 ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3858 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3859 ; GFX8-NEXT: v_readlane_b32 s31, v2, 1
3860 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0
3861 ; GFX8-NEXT: s_mov_b32 s32, s33
3862 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
3863 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3864 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
3865 ; GFX8-NEXT: s_mov_b32 s33, s18
3866 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3867 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3869 ; GFX9-LABEL: test_call:
3870 ; GFX9: ; %bb.0: ; %entry
3871 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3872 ; GFX9-NEXT: s_mov_b32 s18, s33
3873 ; GFX9-NEXT: s_mov_b32 s33, s32
3874 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
3875 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3876 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
3877 ; GFX9-NEXT: s_addk_i32 s32, 0x400
3878 ; GFX9-NEXT: s_getpc_b64 s[16:17]
3879 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3880 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3881 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3882 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0
3883 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1
3884 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3885 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
3886 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3887 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3888 ; GFX9-NEXT: v_readlane_b32 s31, v2, 1
3889 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0
3890 ; GFX9-NEXT: s_mov_b32 s32, s33
3891 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
3892 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3893 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
3894 ; GFX9-NEXT: s_mov_b32 s33, s18
3895 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3896 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3898 ; GFX10-LABEL: test_call:
3899 ; GFX10: ; %bb.0: ; %entry
3900 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3901 ; GFX10-NEXT: s_mov_b32 s18, s33
3902 ; GFX10-NEXT: s_mov_b32 s33, s32
3903 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
3904 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3905 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3906 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
3907 ; GFX10-NEXT: s_addk_i32 s32, 0x200
3908 ; GFX10-NEXT: s_getpc_b64 s[16:17]
3909 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3910 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3911 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0
3912 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3913 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1
3914 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3915 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
3916 ; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3917 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3918 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1
3919 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0
3920 ; GFX10-NEXT: s_mov_b32 s32, s33
3921 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
3922 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3923 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3924 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
3925 ; GFX10-NEXT: s_mov_b32 s33, s18
3926 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3927 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3929 ; GFX11-LABEL: test_call:
3930 ; GFX11: ; %bb.0: ; %entry
3931 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3932 ; GFX11-NEXT: s_mov_b32 s2, s33
3933 ; GFX11-NEXT: s_mov_b32 s33, s32
3934 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
3935 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
3936 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
3937 ; GFX11-NEXT: s_add_i32 s32, s32, 16
3938 ; GFX11-NEXT: s_getpc_b64 s[0:1]
3939 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4
3940 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
3941 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0
3942 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3943 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1
3944 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3945 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
3946 ; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
3947 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3948 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1
3949 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0
3950 ; GFX11-NEXT: s_mov_b32 s32, s33
3951 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
3952 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
3953 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
3954 ; GFX11-NEXT: s_mov_b32 s33, s2
3955 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3956 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3958 %result = call bfloat @test_arg_store(bfloat %in)
3959 store volatile bfloat %result, ptr addrspace(5) %out
; test_call_v2bf16: passes %in to @test_arg_store_v2bf16 and volatile-stores the
; returned <2 x bfloat> through the addrspace(5) pointer %out.
; The assertions that follow are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script instead of editing by hand.
; Per the assertions, the GCN and GFX7 runs write the result as two 16-bit
; stores (base + 2, then base), while the GFX8, GFX9, GFX10 and GFX11 runs
; write it with a single 32-bit store.
3963 define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
3964 ; GCN-LABEL: test_call_v2bf16:
3965 ; GCN: ; %bb.0: ; %entry
3966 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3967 ; GCN-NEXT: s_mov_b32 s18, s33
3968 ; GCN-NEXT: s_mov_b32 s33, s32
3969 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
3970 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
3971 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
3972 ; GCN-NEXT: s_addk_i32 s32, 0x400
3973 ; GCN-NEXT: s_waitcnt expcnt(0)
3974 ; GCN-NEXT: v_writelane_b32 v4, s30, 0
3975 ; GCN-NEXT: v_writelane_b32 v4, s31, 1
3976 ; GCN-NEXT: s_getpc_b64 s[16:17]
3977 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
3978 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
3979 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3980 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
3981 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
3982 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3983 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3984 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2
3985 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3986 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3987 ; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
3988 ; GCN-NEXT: s_waitcnt vmcnt(0)
3989 ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
3990 ; GCN-NEXT: s_waitcnt vmcnt(0)
3991 ; GCN-NEXT: v_readlane_b32 s31, v4, 1
3992 ; GCN-NEXT: v_readlane_b32 s30, v4, 0
3993 ; GCN-NEXT: s_mov_b32 s32, s33
3994 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
3995 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
3996 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
3997 ; GCN-NEXT: s_mov_b32 s33, s18
3998 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3999 ; GCN-NEXT: s_setpc_b64 s[30:31]
4001 ; GFX7-LABEL: test_call_v2bf16:
4002 ; GFX7: ; %bb.0: ; %entry
4003 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4004 ; GFX7-NEXT: s_mov_b32 s18, s33
4005 ; GFX7-NEXT: s_mov_b32 s33, s32
4006 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4007 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4008 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4009 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4010 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4011 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4012 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4013 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4014 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0
4015 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1
4016 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4017 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4018 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4019 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4020 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4021 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2
4022 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4023 ; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
4024 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4025 ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
4026 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4027 ; GFX7-NEXT: v_readlane_b32 s31, v4, 1
4028 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0
4029 ; GFX7-NEXT: s_mov_b32 s32, s33
4030 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4031 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4032 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4033 ; GFX7-NEXT: s_mov_b32 s33, s18
4034 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4035 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4037 ; GFX8-LABEL: test_call_v2bf16:
4038 ; GFX8: ; %bb.0: ; %entry
4039 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4040 ; GFX8-NEXT: s_mov_b32 s18, s33
4041 ; GFX8-NEXT: s_mov_b32 s33, s32
4042 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4043 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4044 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4045 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4046 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4047 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4048 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4049 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4050 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0
4051 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1
4052 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4053 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4054 ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4055 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4056 ; GFX8-NEXT: v_readlane_b32 s31, v2, 1
4057 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0
4058 ; GFX8-NEXT: s_mov_b32 s32, s33
4059 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4060 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4061 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4062 ; GFX8-NEXT: s_mov_b32 s33, s18
4063 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4064 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4066 ; GFX9-LABEL: test_call_v2bf16:
4067 ; GFX9: ; %bb.0: ; %entry
4068 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4069 ; GFX9-NEXT: s_mov_b32 s18, s33
4070 ; GFX9-NEXT: s_mov_b32 s33, s32
4071 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4072 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4073 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4074 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4075 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4076 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4077 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4078 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4079 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0
4080 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1
4081 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4082 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4083 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4084 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4085 ; GFX9-NEXT: v_readlane_b32 s31, v2, 1
4086 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0
4087 ; GFX9-NEXT: s_mov_b32 s32, s33
4088 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4089 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4090 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4091 ; GFX9-NEXT: s_mov_b32 s33, s18
4092 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4093 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4095 ; GFX10-LABEL: test_call_v2bf16:
4096 ; GFX10: ; %bb.0: ; %entry
4097 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4098 ; GFX10-NEXT: s_mov_b32 s18, s33
4099 ; GFX10-NEXT: s_mov_b32 s33, s32
4100 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4101 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4102 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4103 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4104 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4105 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4106 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4107 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4108 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0
4109 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4110 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1
4111 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4112 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4113 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4114 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4115 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1
4116 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0
4117 ; GFX10-NEXT: s_mov_b32 s32, s33
4118 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4119 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4120 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4121 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4122 ; GFX10-NEXT: s_mov_b32 s33, s18
4123 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4124 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4126 ; GFX11-LABEL: test_call_v2bf16:
4127 ; GFX11: ; %bb.0: ; %entry
4128 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4129 ; GFX11-NEXT: s_mov_b32 s2, s33
4130 ; GFX11-NEXT: s_mov_b32 s33, s32
4131 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4132 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
4133 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4134 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4135 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4136 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4137 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4138 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0
4139 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4140 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1
4141 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4142 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4143 ; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
4144 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4145 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1
4146 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0
4147 ; GFX11-NEXT: s_mov_b32 s32, s33
4148 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4149 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
4150 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4151 ; GFX11-NEXT: s_mov_b32 s33, s2
4152 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4153 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4155 %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
4156 store volatile <2 x bfloat> %result, ptr addrspace(5) %out
; test_call_v3bf16: calls @test_arg_store_v2bf16 with the <3 x bfloat> argument
; %in and volatile-stores the <3 x bfloat> result through the addrspace(5)
; pointer %out.
; NOTE(review): the callee symbol is the "v2bf16" one although this call site
; uses <3 x bfloat>. The assertions reference that same symbol, so this is
; presumably intentional -- confirm before renaming, and regenerate if changed.
; The assertions that follow are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script instead of editing by hand.
; Per the assertions, every run writes the result as one 16-bit store at byte
; offset 4 followed by one 32-bit store at the base address.
4160 define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
4161 ; GCN-LABEL: test_call_v3bf16:
4162 ; GCN: ; %bb.0: ; %entry
4163 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4164 ; GCN-NEXT: s_mov_b32 s18, s33
4165 ; GCN-NEXT: s_mov_b32 s33, s32
4166 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4167 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4168 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4169 ; GCN-NEXT: s_addk_i32 s32, 0x400
4170 ; GCN-NEXT: s_waitcnt expcnt(0)
4171 ; GCN-NEXT: v_writelane_b32 v5, s30, 0
4172 ; GCN-NEXT: v_writelane_b32 v5, s31, 1
4173 ; GCN-NEXT: s_getpc_b64 s[16:17]
4174 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4175 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4176 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4177 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4178 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4179 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4180 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4181 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4182 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3
4183 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4184 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4185 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
4186 ; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen
4187 ; GCN-NEXT: s_waitcnt vmcnt(0)
4188 ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
4189 ; GCN-NEXT: s_waitcnt vmcnt(0)
4190 ; GCN-NEXT: v_readlane_b32 s31, v5, 1
4191 ; GCN-NEXT: v_readlane_b32 s30, v5, 0
4192 ; GCN-NEXT: s_mov_b32 s32, s33
4193 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4194 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4195 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4196 ; GCN-NEXT: s_mov_b32 s33, s18
4197 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4198 ; GCN-NEXT: s_setpc_b64 s[30:31]
4200 ; GFX7-LABEL: test_call_v3bf16:
4201 ; GFX7: ; %bb.0: ; %entry
4202 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4203 ; GFX7-NEXT: s_mov_b32 s18, s33
4204 ; GFX7-NEXT: s_mov_b32 s33, s32
4205 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4206 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4207 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4208 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4209 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4210 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4211 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4212 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4213 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0
4214 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1
4215 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4216 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4217 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4218 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4219 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4220 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
4221 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
4222 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4223 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3
4224 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4225 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4226 ; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
4227 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4228 ; GFX7-NEXT: v_readlane_b32 s31, v4, 1
4229 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0
4230 ; GFX7-NEXT: s_mov_b32 s32, s33
4231 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4232 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4233 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4234 ; GFX7-NEXT: s_mov_b32 s33, s18
4235 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4236 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4238 ; GFX8-LABEL: test_call_v3bf16:
4239 ; GFX8: ; %bb.0: ; %entry
4240 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4241 ; GFX8-NEXT: s_mov_b32 s18, s33
4242 ; GFX8-NEXT: s_mov_b32 s33, s32
4243 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4244 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4245 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4246 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4247 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4248 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4249 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4250 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4251 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0
4252 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1
4253 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4254 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4255 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
4256 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
4257 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4258 ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4259 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4260 ; GFX8-NEXT: v_readlane_b32 s31, v4, 1
4261 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0
4262 ; GFX8-NEXT: s_mov_b32 s32, s33
4263 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4264 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4265 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4266 ; GFX8-NEXT: s_mov_b32 s33, s18
4267 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4268 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4270 ; GFX9-LABEL: test_call_v3bf16:
4271 ; GFX9: ; %bb.0: ; %entry
4272 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4273 ; GFX9-NEXT: s_mov_b32 s18, s33
4274 ; GFX9-NEXT: s_mov_b32 s33, s32
4275 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4276 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4277 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4278 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4279 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4280 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4281 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4282 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4283 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0
4284 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1
4285 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4286 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4287 ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4289 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4290 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4291 ; GFX9-NEXT: v_readlane_b32 s31, v3, 1
4292 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0
4293 ; GFX9-NEXT: s_mov_b32 s32, s33
4294 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4295 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4296 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4297 ; GFX9-NEXT: s_mov_b32 s33, s18
4298 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4299 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4301 ; GFX10-LABEL: test_call_v3bf16:
4302 ; GFX10: ; %bb.0: ; %entry
4303 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4304 ; GFX10-NEXT: s_mov_b32 s18, s33
4305 ; GFX10-NEXT: s_mov_b32 s33, s32
4306 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4307 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4308 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4309 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4310 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4311 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4312 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4313 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4314 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0
4315 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4316 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1
4317 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4318 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4319 ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4320 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4321 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4322 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4323 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1
4324 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0
4325 ; GFX10-NEXT: s_mov_b32 s32, s33
4326 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4327 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4328 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4329 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4330 ; GFX10-NEXT: s_mov_b32 s33, s18
4331 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4332 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4334 ; GFX11-LABEL: test_call_v3bf16:
4335 ; GFX11: ; %bb.0: ; %entry
4336 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4337 ; GFX11-NEXT: s_mov_b32 s2, s33
4338 ; GFX11-NEXT: s_mov_b32 s33, s32
4339 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4340 ; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4341 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4342 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4343 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4344 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4345 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4346 ; GFX11-NEXT: v_writelane_b32 v3, s30, 0
4347 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4348 ; GFX11-NEXT: v_writelane_b32 v3, s31, 1
4349 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4350 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4351 ; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
4352 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4353 ; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc
4354 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4355 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1
4356 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0
4357 ; GFX11-NEXT: s_mov_b32 s32, s33
4358 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4359 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4360 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4361 ; GFX11-NEXT: s_mov_b32 s33, s2
4362 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4363 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4365 %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
4366 store volatile <3 x bfloat> %result, ptr addrspace(5) %out
; test_call_v4bf16: calls @test_arg_store_v2bf16 with the <4 x bfloat> argument
; %in and volatile-stores the <4 x bfloat> result through the addrspace(5)
; pointer %out.
; NOTE(review): the callee symbol is the "v2bf16" one although this call site
; uses <4 x bfloat>. The assertions reference that same symbol, so this is
; presumably intentional -- confirm before renaming, and regenerate if changed.
; The assertions that follow are autogenerated (see the NOTE on the first line
; of this file); regenerate them with that script instead of editing by hand.
; Per the assertions, the GCN and GFX7 runs write the result as four 16-bit
; stores (byte offsets 6, 4, 2, 0); the GFX8, GFX9 and GFX10 runs use two
; 32-bit stores (offset 4, then base); the GFX11 run uses one 64-bit store.
4370 define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
4371 ; GCN-LABEL: test_call_v4bf16:
4372 ; GCN: ; %bb.0: ; %entry
4373 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4374 ; GCN-NEXT: s_mov_b32 s18, s33
4375 ; GCN-NEXT: s_mov_b32 s33, s32
4376 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4377 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
4378 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4379 ; GCN-NEXT: s_addk_i32 s32, 0x400
4380 ; GCN-NEXT: s_waitcnt expcnt(0)
4381 ; GCN-NEXT: v_writelane_b32 v8, s30, 0
4382 ; GCN-NEXT: v_writelane_b32 v8, s31, 1
4383 ; GCN-NEXT: s_getpc_b64 s[16:17]
4384 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4385 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4386 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4387 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4388 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4389 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4390 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4391 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4392 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4393 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4
4394 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4
4395 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4
4396 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4397 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4398 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4399 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4400 ; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
4401 ; GCN-NEXT: s_waitcnt vmcnt(0)
4402 ; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen
4403 ; GCN-NEXT: s_waitcnt vmcnt(0)
4404 ; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen
4405 ; GCN-NEXT: s_waitcnt vmcnt(0)
4406 ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
4407 ; GCN-NEXT: s_waitcnt vmcnt(0)
4408 ; GCN-NEXT: v_readlane_b32 s31, v8, 1
4409 ; GCN-NEXT: v_readlane_b32 s30, v8, 0
4410 ; GCN-NEXT: s_mov_b32 s32, s33
4411 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4412 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
4413 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4414 ; GCN-NEXT: s_mov_b32 s33, s18
4415 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4416 ; GCN-NEXT: s_setpc_b64 s[30:31]
4418 ; GFX7-LABEL: test_call_v4bf16:
4419 ; GFX7: ; %bb.0: ; %entry
4420 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4421 ; GFX7-NEXT: s_mov_b32 s18, s33
4422 ; GFX7-NEXT: s_mov_b32 s33, s32
4423 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4424 ; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4425 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4426 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4427 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4428 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4429 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4430 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4431 ; GFX7-NEXT: v_writelane_b32 v6, s30, 0
4432 ; GFX7-NEXT: v_writelane_b32 v6, s31, 1
4433 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4434 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4435 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
4436 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
4437 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4438 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4
4439 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4440 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4441 ; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
4442 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4443 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4
4444 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4445 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4446 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
4447 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4448 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4
4449 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4450 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4451 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4452 ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
4453 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4454 ; GFX7-NEXT: v_readlane_b32 s31, v6, 1
4455 ; GFX7-NEXT: v_readlane_b32 s30, v6, 0
4456 ; GFX7-NEXT: s_mov_b32 s32, s33
4457 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4458 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4459 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4460 ; GFX7-NEXT: s_mov_b32 s33, s18
4461 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4462 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4464 ; GFX8-LABEL: test_call_v4bf16:
4465 ; GFX8: ; %bb.0: ; %entry
4466 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4467 ; GFX8-NEXT: s_mov_b32 s18, s33
4468 ; GFX8-NEXT: s_mov_b32 s33, s32
4469 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4470 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4471 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4472 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4473 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4474 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4475 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4476 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4477 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0
4478 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1
4479 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4480 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4481 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
4482 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
4483 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4484 ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4485 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4486 ; GFX8-NEXT: v_readlane_b32 s31, v4, 1
4487 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0
4488 ; GFX8-NEXT: s_mov_b32 s32, s33
4489 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4490 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4491 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4492 ; GFX8-NEXT: s_mov_b32 s33, s18
4493 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4494 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4496 ; GFX9-LABEL: test_call_v4bf16:
4497 ; GFX9: ; %bb.0: ; %entry
4498 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4499 ; GFX9-NEXT: s_mov_b32 s18, s33
4500 ; GFX9-NEXT: s_mov_b32 s33, s32
4501 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4502 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4503 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4504 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4505 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4506 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4507 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4508 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4509 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0
4510 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1
4511 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4512 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4513 ; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4514 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4515 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4516 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4517 ; GFX9-NEXT: v_readlane_b32 s31, v3, 1
4518 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0
4519 ; GFX9-NEXT: s_mov_b32 s32, s33
4520 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4521 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4522 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4523 ; GFX9-NEXT: s_mov_b32 s33, s18
4524 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4525 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4527 ; GFX10-LABEL: test_call_v4bf16:
4528 ; GFX10: ; %bb.0: ; %entry
4529 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4530 ; GFX10-NEXT: s_mov_b32 s18, s33
4531 ; GFX10-NEXT: s_mov_b32 s33, s32
4532 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4533 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4534 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4535 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4536 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4537 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4538 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4539 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4540 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0
4541 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4542 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1
4543 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4544 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4545 ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4546 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4547 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4548 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4549 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1
4550 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0
4551 ; GFX10-NEXT: s_mov_b32 s32, s33
4552 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4553 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4554 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4555 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4556 ; GFX10-NEXT: s_mov_b32 s33, s18
4557 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4558 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4560 ; GFX11-LABEL: test_call_v4bf16:
4561 ; GFX11: ; %bb.0: ; %entry
4562 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4563 ; GFX11-NEXT: s_mov_b32 s2, s33
4564 ; GFX11-NEXT: s_mov_b32 s33, s32
4565 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4566 ; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4567 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4568 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4569 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4570 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4571 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4572 ; GFX11-NEXT: v_writelane_b32 v3, s30, 0
4573 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4574 ; GFX11-NEXT: v_writelane_b32 v3, s31, 1
4575 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4576 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4577 ; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
4578 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4579 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1
4580 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0
4581 ; GFX11-NEXT: s_mov_b32 s32, s33
4582 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4583 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4584 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4585 ; GFX11-NEXT: s_mov_b32 s33, s2
4586 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4587 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4589 %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
4590 store volatile <4 x bfloat> %result, ptr addrspace(5) %out
; test_call_v8bf16: passes the <8 x bfloat> argument %in to
; @test_arg_store_v2bf16 and volatile-stores the returned <8 x bfloat> value
; through the addrspace(5) pointer %out.
;
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; As recorded in those checks, the default-target and hawaii runs write the
; result with eight 16-bit buffer stores, the tonga, gfx900 and gfx1010 runs
; use four dword buffer stores, and the gfx1100 runs use one scratch_store_b128.
;
; Reviewer note (unconfirmed): the callee name suggests a <2 x bfloat>
; signature, yet it is called here with <8 x bfloat> types - please confirm
; that the mismatch is intentional before relying on it.
4594 define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
4595 ; GCN-LABEL: test_call_v8bf16:
4596 ; GCN: ; %bb.0: ; %entry
4597 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4598 ; GCN-NEXT: s_mov_b32 s18, s33
4599 ; GCN-NEXT: s_mov_b32 s33, s32
4600 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4601 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill
4602 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4603 ; GCN-NEXT: s_addk_i32 s32, 0x400
4604 ; GCN-NEXT: s_waitcnt expcnt(0)
4605 ; GCN-NEXT: v_writelane_b32 v16, s30, 0
4606 ; GCN-NEXT: v_writelane_b32 v16, s31, 1
4607 ; GCN-NEXT: s_getpc_b64 s[16:17]
4608 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4609 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4610 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4611 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4612 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4613 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4614 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4615 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4616 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4617 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4618 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
4619 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
4620 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
4621 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8
4622 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8
4623 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8
4624 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8
4625 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8
4626 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8
4627 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8
4628 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4629 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4630 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4631 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4632 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4633 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4634 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4635 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4636 ; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
4637 ; GCN-NEXT: s_waitcnt vmcnt(0)
4638 ; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen
4639 ; GCN-NEXT: s_waitcnt vmcnt(0)
4640 ; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen
4641 ; GCN-NEXT: s_waitcnt vmcnt(0)
4642 ; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen
4643 ; GCN-NEXT: s_waitcnt vmcnt(0)
4644 ; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen
4645 ; GCN-NEXT: s_waitcnt vmcnt(0)
4646 ; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen
4647 ; GCN-NEXT: s_waitcnt vmcnt(0)
4648 ; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen
4649 ; GCN-NEXT: s_waitcnt vmcnt(0)
4650 ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
4651 ; GCN-NEXT: s_waitcnt vmcnt(0)
4652 ; GCN-NEXT: v_readlane_b32 s31, v16, 1
4653 ; GCN-NEXT: v_readlane_b32 s30, v16, 0
4654 ; GCN-NEXT: s_mov_b32 s32, s33
4655 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4656 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
4657 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4658 ; GCN-NEXT: s_mov_b32 s33, s18
4659 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4660 ; GCN-NEXT: s_setpc_b64 s[30:31]
4662 ; GFX7-LABEL: test_call_v8bf16:
4663 ; GFX7: ; %bb.0: ; %entry
4664 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4665 ; GFX7-NEXT: s_mov_b32 s18, s33
4666 ; GFX7-NEXT: s_mov_b32 s33, s32
4667 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4668 ; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
4669 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4670 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4671 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4672 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4673 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4674 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4675 ; GFX7-NEXT: v_writelane_b32 v10, s30, 0
4676 ; GFX7-NEXT: v_writelane_b32 v10, s31, 1
4677 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4678 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4679 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
4680 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
4681 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4682 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8
4683 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
4684 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4685 ; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
4686 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4687 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8
4688 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
4689 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4690 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
4691 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4692 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8
4693 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
4694 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4695 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
4696 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4697 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8
4698 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
4699 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4700 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
4701 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4702 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8
4703 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4704 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4705 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
4706 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4707 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8
4708 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4709 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4710 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
4711 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4712 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8
4713 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4714 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4715 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4716 ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
4717 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4718 ; GFX7-NEXT: v_readlane_b32 s31, v10, 1
4719 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0
4720 ; GFX7-NEXT: s_mov_b32 s32, s33
4721 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4722 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
4723 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4724 ; GFX7-NEXT: s_mov_b32 s33, s18
4725 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4726 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4728 ; GFX8-LABEL: test_call_v8bf16:
4729 ; GFX8: ; %bb.0: ; %entry
4730 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4731 ; GFX8-NEXT: s_mov_b32 s18, s33
4732 ; GFX8-NEXT: s_mov_b32 s33, s32
4733 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4734 ; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4735 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4736 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4737 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4738 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4739 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4740 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4741 ; GFX8-NEXT: v_writelane_b32 v6, s30, 0
4742 ; GFX8-NEXT: v_writelane_b32 v6, s31, 1
4743 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4744 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4745 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4
4746 ; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4747 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4748 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4
4749 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
4750 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4751 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
4752 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
4753 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4754 ; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4755 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4756 ; GFX8-NEXT: v_readlane_b32 s31, v6, 1
4757 ; GFX8-NEXT: v_readlane_b32 s30, v6, 0
4758 ; GFX8-NEXT: s_mov_b32 s32, s33
4759 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4760 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4761 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4762 ; GFX8-NEXT: s_mov_b32 s33, s18
4763 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4764 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4766 ; GFX9-LABEL: test_call_v8bf16:
4767 ; GFX9: ; %bb.0: ; %entry
4768 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4769 ; GFX9-NEXT: s_mov_b32 s18, s33
4770 ; GFX9-NEXT: s_mov_b32 s33, s32
4771 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4772 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4773 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4774 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4775 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4776 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4777 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4778 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4779 ; GFX9-NEXT: v_writelane_b32 v5, s30, 0
4780 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1
4781 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4782 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4783 ; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4784 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4785 ; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4786 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4787 ; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4788 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4789 ; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4790 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4791 ; GFX9-NEXT: v_readlane_b32 s31, v5, 1
4792 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0
4793 ; GFX9-NEXT: s_mov_b32 s32, s33
4794 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4795 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4796 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4797 ; GFX9-NEXT: s_mov_b32 s33, s18
4798 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4799 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4801 ; GFX10-LABEL: test_call_v8bf16:
4802 ; GFX10: ; %bb.0: ; %entry
4803 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4804 ; GFX10-NEXT: s_mov_b32 s18, s33
4805 ; GFX10-NEXT: s_mov_b32 s33, s32
4806 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4807 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4808 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4809 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4810 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4811 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4812 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4813 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4814 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0
4815 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4816 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1
4817 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4818 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4819 ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4820 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4821 ; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4822 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4823 ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4824 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4825 ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4826 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4827 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1
4828 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0
4829 ; GFX10-NEXT: s_mov_b32 s32, s33
4830 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4831 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4832 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4833 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4834 ; GFX10-NEXT: s_mov_b32 s33, s18
4835 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4836 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4838 ; GFX11-LABEL: test_call_v8bf16:
4839 ; GFX11: ; %bb.0: ; %entry
4840 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4841 ; GFX11-NEXT: s_mov_b32 s2, s33
4842 ; GFX11-NEXT: s_mov_b32 s33, s32
4843 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4844 ; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
4845 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4846 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4847 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4848 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4849 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4850 ; GFX11-NEXT: v_writelane_b32 v5, s30, 0
4851 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4852 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1
4853 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4854 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4855 ; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
4856 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4857 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1
4858 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0
4859 ; GFX11-NEXT: s_mov_b32 s32, s33
4860 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4861 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
4862 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4863 ; GFX11-NEXT: s_mov_b32 s33, s2
4864 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4865 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4867 %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
4868 store volatile <8 x bfloat> %result, ptr addrspace(5) %out
; test_call_v16bf16: passes the <16 x bfloat> argument %in to
; @test_arg_store_v2bf16 and volatile-stores the returned <16 x bfloat> value
; through the addrspace(5) pointer %out.
;
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; As recorded in those checks, the default-target and hawaii runs write the
; result with sixteen 16-bit buffer stores, the tonga, gfx900 and gfx1010 runs
; use eight dword buffer stores, and the gfx1100 runs use two
; scratch_store_b128 instructions (offset 16 first, then offset 0).
;
; Reviewer note (unconfirmed): the callee name suggests a <2 x bfloat>
; signature, yet it is called here with <16 x bfloat> types - please confirm
; that the mismatch is intentional before relying on it.
4872 define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
4873 ; GCN-LABEL: test_call_v16bf16:
4874 ; GCN: ; %bb.0: ; %entry
4875 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4876 ; GCN-NEXT: s_mov_b32 s18, s33
4877 ; GCN-NEXT: s_mov_b32 s33, s32
4878 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4879 ; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
4880 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4881 ; GCN-NEXT: s_addk_i32 s32, 0x400
4882 ; GCN-NEXT: s_waitcnt expcnt(0)
4883 ; GCN-NEXT: v_writelane_b32 v20, s30, 0
4884 ; GCN-NEXT: v_writelane_b32 v20, s31, 1
4885 ; GCN-NEXT: s_getpc_b64 s[16:17]
4886 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4887 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4888 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4889 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4890 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4891 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4892 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4893 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4894 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4895 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4896 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
4897 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
4898 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
4899 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
4900 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
4901 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
4902 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
4903 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
4904 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
4905 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
4906 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
4907 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
4908 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
4909 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
4910 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
4911 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
4912 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4913 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16
4914 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16
4915 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
4916 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
4917 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4918 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16
4919 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16
4920 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
4921 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
4922 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4923 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16
4924 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16
4925 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
4926 ; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen
4927 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4928 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16
4929 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16
4930 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
4931 ; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen
4932 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4933 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16
4934 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16
4935 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
4936 ; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen
4937 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4938 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16
4939 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16
4940 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4941 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4942 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4943 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4944 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4945 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4946 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4947 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4948 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
4949 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
4950 ; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen
4951 ; GCN-NEXT: s_waitcnt vmcnt(0)
4952 ; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen
4953 ; GCN-NEXT: s_waitcnt vmcnt(0)
4954 ; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen
4955 ; GCN-NEXT: s_waitcnt vmcnt(0)
4956 ; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen
4957 ; GCN-NEXT: s_waitcnt vmcnt(0)
4958 ; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
4959 ; GCN-NEXT: s_waitcnt vmcnt(0)
4960 ; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen
4961 ; GCN-NEXT: s_waitcnt vmcnt(0)
4962 ; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen
4963 ; GCN-NEXT: s_waitcnt vmcnt(0)
4964 ; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen
4965 ; GCN-NEXT: s_waitcnt vmcnt(0)
4966 ; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
4967 ; GCN-NEXT: s_waitcnt vmcnt(0)
4968 ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
4969 ; GCN-NEXT: s_waitcnt vmcnt(0)
4970 ; GCN-NEXT: v_readlane_b32 s31, v20, 1
4971 ; GCN-NEXT: v_readlane_b32 s30, v20, 0
4972 ; GCN-NEXT: s_mov_b32 s32, s33
4973 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4974 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
4975 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4976 ; GCN-NEXT: s_mov_b32 s33, s18
4977 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4978 ; GCN-NEXT: s_setpc_b64 s[30:31]
4980 ; GFX7-LABEL: test_call_v16bf16:
4981 ; GFX7: ; %bb.0: ; %entry
4982 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4983 ; GFX7-NEXT: s_mov_b32 s18, s33
4984 ; GFX7-NEXT: s_mov_b32 s33, s32
4985 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4986 ; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
4987 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4988 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4989 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4990 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4991 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4992 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4993 ; GFX7-NEXT: v_writelane_b32 v18, s30, 0
4994 ; GFX7-NEXT: v_writelane_b32 v18, s31, 1
4995 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4996 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4997 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
4998 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
4999 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
5000 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16
5001 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
5002 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
5003 ; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
5004 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5005 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16
5006 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
5007 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
5008 ; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen
5009 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5010 ; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16
5011 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
5012 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
5013 ; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen
5014 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5015 ; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16
5016 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
5017 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
5018 ; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen
5019 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5020 ; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16
5021 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
5022 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
5023 ; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen
5024 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5025 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16
5026 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
5027 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
5028 ; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen
5029 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5030 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16
5031 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
5032 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
5033 ; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen
5034 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5035 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16
5036 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
5037 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
5038 ; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen
5039 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5040 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16
5041 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
5042 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
5043 ; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen
5044 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5045 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16
5046 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
5047 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
5048 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
5049 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5050 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16
5051 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
5052 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
5053 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
5054 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5055 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16
5056 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
5057 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
5058 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
5059 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5060 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16
5061 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
5062 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
5063 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
5064 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5065 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16
5066 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
5067 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5068 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
5069 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5070 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16
5071 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5072 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
5073 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5074 ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
5075 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5076 ; GFX7-NEXT: v_readlane_b32 s31, v18, 1
5077 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0
5078 ; GFX7-NEXT: s_mov_b32 s32, s33
5079 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
5080 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
5081 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
5082 ; GFX7-NEXT: s_mov_b32 s33, s18
5083 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5084 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5086 ; GFX8-LABEL: test_call_v16bf16:
5087 ; GFX8: ; %bb.0: ; %entry
5088 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5089 ; GFX8-NEXT: s_mov_b32 s18, s33
5090 ; GFX8-NEXT: s_mov_b32 s33, s32
5091 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
5092 ; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
5093 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
5094 ; GFX8-NEXT: s_addk_i32 s32, 0x400
5095 ; GFX8-NEXT: s_getpc_b64 s[16:17]
5096 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5097 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5098 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5099 ; GFX8-NEXT: v_writelane_b32 v10, s30, 0
5100 ; GFX8-NEXT: v_writelane_b32 v10, s31, 1
5101 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
5102 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
5103 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8
5104 ; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
5105 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5106 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8
5107 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
5108 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5109 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8
5110 ; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
5111 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5112 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8
5113 ; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
5114 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5115 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8
5116 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
5117 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5118 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8
5119 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
5120 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5121 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8
5122 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
5123 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5124 ; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5125 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5126 ; GFX8-NEXT: v_readlane_b32 s31, v10, 1
5127 ; GFX8-NEXT: v_readlane_b32 s30, v10, 0
5128 ; GFX8-NEXT: s_mov_b32 s32, s33
5129 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
5130 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
5131 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
5132 ; GFX8-NEXT: s_mov_b32 s33, s18
5133 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5134 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5136 ; GFX9-LABEL: test_call_v16bf16:
5137 ; GFX9: ; %bb.0: ; %entry
5138 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5139 ; GFX9-NEXT: s_mov_b32 s18, s33
5140 ; GFX9-NEXT: s_mov_b32 s33, s32
5141 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
5142 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5143 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
5144 ; GFX9-NEXT: s_addk_i32 s32, 0x400
5145 ; GFX9-NEXT: s_getpc_b64 s[16:17]
5146 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5147 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5148 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5149 ; GFX9-NEXT: v_writelane_b32 v9, s30, 0
5150 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1
5151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5152 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
5153 ; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5154 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5155 ; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5156 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5157 ; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5159 ; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5161 ; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5162 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5163 ; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5164 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5165 ; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5166 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5167 ; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5169 ; GFX9-NEXT: v_readlane_b32 s31, v9, 1
5170 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0
5171 ; GFX9-NEXT: s_mov_b32 s32, s33
5172 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
5173 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5174 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
5175 ; GFX9-NEXT: s_mov_b32 s33, s18
5176 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5177 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5179 ; GFX10-LABEL: test_call_v16bf16:
5180 ; GFX10: ; %bb.0: ; %entry
5181 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5182 ; GFX10-NEXT: s_mov_b32 s18, s33
5183 ; GFX10-NEXT: s_mov_b32 s33, s32
5184 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
5185 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5186 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5187 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
5188 ; GFX10-NEXT: s_addk_i32 s32, 0x200
5189 ; GFX10-NEXT: s_getpc_b64 s[16:17]
5190 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5191 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5192 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0
5193 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5194 ; GFX10-NEXT: v_writelane_b32 v9, s31, 1
5195 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
5196 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
5197 ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5198 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5199 ; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5200 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5201 ; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5202 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5203 ; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5204 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5205 ; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5206 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5207 ; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5208 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5209 ; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5210 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5211 ; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5212 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5213 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1
5214 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0
5215 ; GFX10-NEXT: s_mov_b32 s32, s33
5216 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
5217 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5218 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5219 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
5220 ; GFX10-NEXT: s_mov_b32 s33, s18
5221 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5222 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5224 ; GFX11-LABEL: test_call_v16bf16:
5225 ; GFX11: ; %bb.0: ; %entry
5226 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5227 ; GFX11-NEXT: s_mov_b32 s2, s33
5228 ; GFX11-NEXT: s_mov_b32 s33, s32
5229 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
5230 ; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
5231 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
5232 ; GFX11-NEXT: s_add_i32 s32, s32, 16
5233 ; GFX11-NEXT: s_getpc_b64 s[0:1]
5234 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
5235 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
5236 ; GFX11-NEXT: v_writelane_b32 v9, s30, 0
5237 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
5238 ; GFX11-NEXT: v_writelane_b32 v9, s31, 1
5239 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
5240 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
5241 ; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
5242 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5243 ; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc
5244 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5245 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1
5246 ; GFX11-NEXT: v_readlane_b32 s30, v9, 0
5247 ; GFX11-NEXT: s_mov_b32 s32, s33
5248 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
5249 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
5250 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
5251 ; GFX11-NEXT: s_mov_b32 s33, s2
5252 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5253 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5255 %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
5256 store volatile <16 x bfloat> %result, ptr addrspace(5) %out
; Round-trips the bfloat argument through an addrspace(5) alloca: the value is
; written with a volatile store and read back with a volatile load, so both
; memory accesses must survive into the generated code.
; In the expected output below, the two oldest run lines (no -mcpu, hawaii)
; shift the value right by 16 before the 16-bit store and left by 16 after the
; load; the newer targets store and reload v0 directly.
5260 define bfloat @test_alloca_load_store_ret(bfloat %in) {
5261 ; GCN-LABEL: test_alloca_load_store_ret:
5262 ; GCN: ; %bb.0: ; %entry
5263 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5264 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
5265 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5266 ; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
5267 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5268 ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5269 ; GCN-NEXT: s_waitcnt vmcnt(0)
5270 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
5271 ; GCN-NEXT: s_setpc_b64 s[30:31]
5273 ; GFX7-LABEL: test_alloca_load_store_ret:
5274 ; GFX7: ; %bb.0: ; %entry
5275 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5276 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
5277 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5278 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
5279 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5280 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5281 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5282 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
5283 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5285 ; GFX8-LABEL: test_alloca_load_store_ret:
5286 ; GFX8: ; %bb.0: ; %entry
5287 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5288 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
5289 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5290 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5291 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5292 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5294 ; GFX9-LABEL: test_alloca_load_store_ret:
5295 ; GFX9: ; %bb.0: ; %entry
5296 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5297 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
5298 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5299 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5301 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5303 ; GFX10-LABEL: test_alloca_load_store_ret:
5304 ; GFX10: ; %bb.0: ; %entry
5305 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5306 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
5307 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5308 ; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
5309 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5310 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5312 ; GFX11-LABEL: test_alloca_load_store_ret:
5313 ; GFX11: ; %bb.0: ; %entry
5314 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5315 ; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc
5316 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5317 ; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc
5318 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5319 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5321 %in.addr = alloca bfloat, align 2, addrspace(5)
5322 store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
5323 %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
; Returns a { <32 x i32>, bfloat } aggregate assembled from %b (field 0) and
; %a (field 1) with two insertvalue instructions.
; In the expected output the result is written to memory through the address
; held in v0: 32 dword stores at byte offsets 0..0x7c plus a 16-bit store of
; the bfloat at offset 0x80 (128). Three of the dwords are first loaded from
; s32-relative slots (offsets 0, 4 and 8) and then stored to result offsets
; 0x74, 0x78 and 0x7c (116, 120, 124).
; The three oldest run lines materialize each store address with a separate
; VGPR add, the gfx900 and gfx1010 runs use immediate offsets on v0, and the
; gfx1100 runs use 128-bit scratch stores.
5327 define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
5328 ; GCN-LABEL: test_overflow_stack:
5330 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5331 ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5332 ; GCN-NEXT: s_waitcnt expcnt(0)
5333 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5334 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
5335 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
5336 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32
5337 ; GCN-NEXT: s_waitcnt vmcnt(2)
5338 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5339 ; GCN-NEXT: s_waitcnt expcnt(0)
5340 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
5341 ; GCN-NEXT: s_waitcnt vmcnt(2)
5342 ; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen
5343 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
5344 ; GCN-NEXT: s_waitcnt vmcnt(2)
5345 ; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen
5346 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
5347 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
5348 ; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5349 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
5350 ; GCN-NEXT: s_waitcnt expcnt(0)
5351 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0
5352 ; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen
5353 ; GCN-NEXT: s_waitcnt expcnt(0)
5354 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0
5355 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0
5356 ; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5357 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
5358 ; GCN-NEXT: s_waitcnt expcnt(0)
5359 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
5360 ; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen
5361 ; GCN-NEXT: s_waitcnt expcnt(0)
5362 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
5363 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
5364 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
5365 ; GCN-NEXT: s_waitcnt expcnt(0)
5366 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
5367 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
5368 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
5369 ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
5370 ; GCN-NEXT: s_waitcnt expcnt(0)
5371 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
5372 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
5373 ; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5374 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0
5375 ; GCN-NEXT: s_waitcnt expcnt(0)
5376 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0
5377 ; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen
5378 ; GCN-NEXT: s_waitcnt expcnt(0)
5379 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0
5380 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0
5381 ; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen
5382 ; GCN-NEXT: s_waitcnt expcnt(0)
5383 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0
5384 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0
5385 ; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen
5386 ; GCN-NEXT: s_waitcnt expcnt(0)
5387 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0
5388 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
5389 ; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen
5390 ; GCN-NEXT: s_waitcnt expcnt(0)
5391 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0
5392 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0
5393 ; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen
5394 ; GCN-NEXT: s_waitcnt expcnt(0)
5395 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0
5396 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0
5397 ; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen
5398 ; GCN-NEXT: s_waitcnt expcnt(0)
5399 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0
5400 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0
5401 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
5402 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5403 ; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen
5404 ; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5405 ; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
5406 ; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen
5407 ; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen
5408 ; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen
5409 ; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
5410 ; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
5411 ; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen
5412 ; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen
5413 ; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen
5414 ; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
5415 ; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen
5416 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
5417 ; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
5418 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5419 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5420 ; GCN-NEXT: s_setpc_b64 s[30:31]
5422 ; GFX7-LABEL: test_overflow_stack:
5424 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5425 ; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5426 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5427 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
5428 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
5429 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5430 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5431 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5432 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
5433 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
5434 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5435 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5436 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32
5437 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0
5438 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5439 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5440 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
5441 ; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5442 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
5443 ; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
5444 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
5445 ; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5446 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
5447 ; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
5448 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
5449 ; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
5450 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
5451 ; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
5452 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
5453 ; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5454 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
5455 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
5456 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
5457 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
5458 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
5459 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
5460 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
5461 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
5462 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
5463 ; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
5464 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0
5465 ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
5466 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0
5467 ; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
5468 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
5469 ; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5470 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0
5471 ; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
5472 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0
5473 ; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
5474 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
5475 ; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
5476 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0
5477 ; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
5478 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0
5479 ; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
5480 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0
5481 ; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
5482 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0
5483 ; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
5484 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0
5485 ; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
5486 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0
5487 ; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
5488 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
5489 ; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
5490 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
5491 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
5492 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
5493 ; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5494 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
5495 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
5496 ; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
5497 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5498 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5499 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5501 ; GFX8-LABEL: test_overflow_stack:
5503 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5504 ; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5505 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5506 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0
5507 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5508 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5509 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
5510 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0
5511 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5512 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5513 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32
5514 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0
5515 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5516 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5517 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
5518 ; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5519 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
5520 ; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
5521 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
5522 ; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5523 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
5524 ; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
5525 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
5526 ; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
5527 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
5528 ; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
5529 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
5530 ; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5531 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
5532 ; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
5533 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
5534 ; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
5535 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
5536 ; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
5537 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
5538 ; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
5539 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
5540 ; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
5541 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0
5542 ; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
5543 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0
5544 ; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
5545 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0
5546 ; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5547 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0
5548 ; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
5549 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0
5550 ; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
5551 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0
5552 ; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
5553 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0
5554 ; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
5555 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0
5556 ; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
5557 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
5558 ; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
5559 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0
5560 ; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
5561 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0
5562 ; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
5563 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0
5564 ; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
5565 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
5566 ; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
5567 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0
5568 ; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
5569 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
5570 ; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5571 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
5572 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
5573 ; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
5574 ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5575 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5576 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5578 ; GFX9-LABEL: test_overflow_stack:
5580 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5581 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5582 ; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5583 ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5584 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5585 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5586 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
5587 ; GFX9-NEXT: s_nop 0
5588 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8
5589 ; GFX9-NEXT: s_nop 0
5590 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5591 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32
5592 ; GFX9-NEXT: s_nop 0
5593 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5594 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5595 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5596 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5597 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5598 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5599 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5600 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5601 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5602 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5603 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5604 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5605 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5606 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5607 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5608 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5609 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5610 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5611 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5612 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5613 ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5614 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5615 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5616 ; GFX9-NEXT: s_waitcnt vmcnt(25)
5617 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
5618 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
5619 ; GFX9-NEXT: s_waitcnt vmcnt(25)
5620 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
5621 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5622 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5625 ; GFX10-LABEL: test_overflow_stack:
5627 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5628 ; GFX10-NEXT: s_clause 0x2
5629 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
5630 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
5631 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32
5632 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5633 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5634 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5635 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5636 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5637 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5638 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5639 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5640 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5641 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5642 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5643 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5644 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5645 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5646 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5647 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5648 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5649 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5650 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5651 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5652 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5653 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5654 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5655 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5656 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5657 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5658 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5659 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5660 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5661 ; GFX10-NEXT: s_waitcnt vmcnt(2)
5662 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
5663 ; GFX10-NEXT: s_waitcnt vmcnt(1)
5664 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
5665 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5666 ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
5667 ; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5668 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5670 ; GFX11-LABEL: test_overflow_stack:
5672 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5673 ; GFX11-NEXT: s_clause 0x2
5674 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
5675 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
5676 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
5677 ; GFX11-NEXT: s_clause 0x5
5678 ; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5679 ; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5680 ; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5681 ; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5682 ; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5683 ; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
5684 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5685 ; GFX11-NEXT: s_clause 0x2
5686 ; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5687 ; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5688 ; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
5689 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5690 %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
5691 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
5692 ret { <32 x i32>, bfloat } %ins.1
; Loads a <2 x bfloat> through an addrspace(1) pointer and widens it to
; <2 x float> with fpext.
; Every run line expects a single 32-bit load followed by two ALU ops: a left
; shift by 16 produces element 0 and a mask with 0xffff0000 produces element 1.
5695 define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
5696 ; GCN-LABEL: global_extload_v2bf16_to_v2f32:
5698 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5699 ; GCN-NEXT: s_mov_b32 s6, 0
5700 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5701 ; GCN-NEXT: s_mov_b32 s4, s6
5702 ; GCN-NEXT: s_mov_b32 s5, s6
5703 ; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5704 ; GCN-NEXT: s_waitcnt vmcnt(0)
5705 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5706 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5707 ; GCN-NEXT: s_setpc_b64 s[30:31]
5709 ; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
5711 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5712 ; GFX7-NEXT: s_mov_b32 s6, 0
5713 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5714 ; GFX7-NEXT: s_mov_b32 s4, s6
5715 ; GFX7-NEXT: s_mov_b32 s5, s6
5716 ; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5717 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5718 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5719 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5720 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5722 ; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
5724 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5725 ; GFX8-NEXT: flat_load_dword v1, v[0:1]
5726 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5727 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5728 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5729 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5731 ; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
5733 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5734 ; GFX9-NEXT: global_load_dword v1, v[0:1], off
5735 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5736 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5737 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5738 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5740 ; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
5742 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5743 ; GFX10-NEXT: global_load_dword v1, v[0:1], off
5744 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5745 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5746 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5747 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5749 ; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
5751 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5752 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
5753 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5754 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5755 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5756 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5757 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
5758 %fpext = fpext <2 x bfloat> %load to <2 x float>
5759 ret <2 x float> %fpext
; Loads a <3 x bfloat> through an addrspace(1) pointer and widens it to
; <3 x float> with fpext.
; Every run line expects one 64-bit load into v[1:2]; elements 0 and 1 come
; from the first dword (shift left by 16, mask with 0xffff0000) and element 2
; from the second dword shifted left by 16.
5762 define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
5763 ; GCN-LABEL: global_extload_v3bf16_to_v3f32:
5765 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5766 ; GCN-NEXT: s_mov_b32 s6, 0
5767 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5768 ; GCN-NEXT: s_mov_b32 s4, s6
5769 ; GCN-NEXT: s_mov_b32 s5, s6
5770 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5771 ; GCN-NEXT: s_waitcnt vmcnt(0)
5772 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5773 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5774 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5775 ; GCN-NEXT: s_setpc_b64 s[30:31]
5777 ; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
5779 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5780 ; GFX7-NEXT: s_mov_b32 s6, 0
5781 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5782 ; GFX7-NEXT: s_mov_b32 s4, s6
5783 ; GFX7-NEXT: s_mov_b32 s5, s6
5784 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5785 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5786 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5787 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5788 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5789 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5791 ; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
5793 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5794 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
5795 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5796 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5797 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5798 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5799 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5801 ; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
5803 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5804 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
5805 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5806 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5807 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5808 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5809 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5811 ; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
5813 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5814 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
5815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5816 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5817 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5818 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5819 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5821 ; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
5823 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5824 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
5825 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5826 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5827 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5828 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5829 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5830 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
5831 %fpext = fpext <3 x bfloat> %load to <3 x float>
5832 ret <3 x float> %fpext
; Loads a <4 x bfloat> through an addrspace(1) pointer and widens it to
; <4 x float> with fpext.
; Every run line expects one 64-bit load into v[2:3]; each loaded dword yields
; two results, one via a left shift by 16 and one via a mask with 0xffff0000.
5835 define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
5836 ; GCN-LABEL: global_extload_v4bf16_to_v4f32:
5838 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5839 ; GCN-NEXT: s_mov_b32 s6, 0
5840 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5841 ; GCN-NEXT: s_mov_b32 s4, s6
5842 ; GCN-NEXT: s_mov_b32 s5, s6
5843 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5844 ; GCN-NEXT: s_waitcnt vmcnt(0)
5845 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5846 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5847 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5848 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5849 ; GCN-NEXT: s_setpc_b64 s[30:31]
5851 ; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
5853 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5854 ; GFX7-NEXT: s_mov_b32 s6, 0
5855 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5856 ; GFX7-NEXT: s_mov_b32 s4, s6
5857 ; GFX7-NEXT: s_mov_b32 s5, s6
5858 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5859 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5860 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5861 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5862 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5863 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5864 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5866 ; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
5868 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5869 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5870 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5871 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5872 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5873 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5874 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5875 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5877 ; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
5879 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5880 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
5881 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5882 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5883 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5884 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5885 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5886 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5888 ; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
5890 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5891 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
5892 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5893 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5894 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5895 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5896 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5897 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5899 ; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
5901 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5902 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
5903 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5904 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5905 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5906 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5907 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5908 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5909 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
5910 %fpext = fpext <4 x bfloat> %load to <4 x float>
5911 ret <4 x float> %fpext
; Extending load of an odd-length vector: reads <5 x bfloat> from global memory
; (addrspace(1)) and widens every lane to float with fpext.
; Per the expectations below, each f32 result is built by moving the bf16 bits
; into the top half of a 32-bit register: v_lshlrev_b32 by 16 for the low half
; of a loaded dword and v_and_b32 with 0xffff0000 for the high half.
; The GCN and GFX7 runs fetch the fifth element with a separate
; buffer_load_ushort at offset 8 next to a dwordx2 load, while GFX8 and newer
; issue a single 128-bit load into v[2:5] and convert only the low half of v4.
5914 define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
5915 ; GCN-LABEL: global_extload_v5bf16_to_v5f32:
5917 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5918 ; GCN-NEXT: s_mov_b32 s6, 0
5919 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5920 ; GCN-NEXT: s_mov_b32 s4, s6
5921 ; GCN-NEXT: s_mov_b32 s5, s6
5922 ; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5923 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5924 ; GCN-NEXT: s_waitcnt vmcnt(1)
5925 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5926 ; GCN-NEXT: s_waitcnt vmcnt(0)
5927 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5928 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5929 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5930 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5931 ; GCN-NEXT: s_setpc_b64 s[30:31]
5933 ; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
5935 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5936 ; GFX7-NEXT: s_mov_b32 s6, 0
5937 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5938 ; GFX7-NEXT: s_mov_b32 s4, s6
5939 ; GFX7-NEXT: s_mov_b32 s5, s6
5940 ; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5941 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5942 ; GFX7-NEXT: s_waitcnt vmcnt(1)
5943 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5944 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5945 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5946 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5947 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5948 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5949 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5951 ; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
5953 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5954 ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
5955 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5956 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5957 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5958 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5959 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5960 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5961 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5963 ; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
5965 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5966 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
5967 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5968 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5969 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5970 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5971 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5972 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5973 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5975 ; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
5977 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5978 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
5979 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5980 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5981 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5982 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5983 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5984 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5985 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5987 ; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
5989 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5990 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
5991 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5992 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5993 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5994 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5995 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5996 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5997 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a plain vector load followed by a vector fpext.
5998 %load = load <5 x bfloat>, ptr addrspace(1) %ptr
5999 %fpext = fpext <5 x bfloat> %load to <5 x float>
6000 ret <5 x float> %fpext
; Extending load of a 96-bit vector: reads <6 x bfloat> from global memory
; (addrspace(1)) and widens every lane to float with fpext.
; The expectations below convert each loaded dword into two f32 results:
; v_lshlrev_b32 by 16 yields the even lane, v_and_b32 with 0xffff0000 the odd
; lane, and the six results land in v0..v5.
; The GCN run expects a dwordx4 load into v[3:6] and never reads v6; every
; other run expects a three-dword load (dwordx3 / b96) into v[3:5].
6003 define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
6004 ; GCN-LABEL: global_extload_v6bf16_to_v6f32:
6006 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6007 ; GCN-NEXT: s_mov_b32 s6, 0
6008 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6009 ; GCN-NEXT: s_mov_b32 s4, s6
6010 ; GCN-NEXT: s_mov_b32 s5, s6
6011 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
6012 ; GCN-NEXT: s_waitcnt vmcnt(0)
6013 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6014 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6015 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6016 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6017 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6018 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6019 ; GCN-NEXT: s_setpc_b64 s[30:31]
6021 ; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
6023 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6024 ; GFX7-NEXT: s_mov_b32 s6, 0
6025 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6026 ; GFX7-NEXT: s_mov_b32 s4, s6
6027 ; GFX7-NEXT: s_mov_b32 s5, s6
6028 ; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
6029 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6030 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6031 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6032 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6033 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6034 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6035 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6036 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6038 ; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
6040 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6041 ; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1]
6042 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6043 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6044 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6045 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6046 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6047 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6048 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6049 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6051 ; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
6053 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6054 ; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
6055 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6056 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6057 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6058 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6059 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6060 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6061 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6062 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6064 ; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
6066 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6067 ; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
6068 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6069 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6070 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6071 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6072 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6073 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6074 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6075 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6077 ; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
6079 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6080 ; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off
6081 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6082 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6083 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6084 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6085 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6086 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6087 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6088 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a plain vector load followed by a vector fpext.
6089 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
6090 %fpext = fpext <6 x bfloat> %load to <6 x float>
6091 ret <6 x float> %fpext
; Extending load of a 128-bit vector: reads <8 x bfloat> from global memory
; (addrspace(1)) and widens every lane to float with fpext.
; Every run expects one 128-bit load into v[4:7] followed by eight conversions
; into v0..v7: v_lshlrev_b32 by 16 produces the even lane of each dword and
; v_and_b32 with 0xffff0000 produces the odd lane.
; Only the load instruction differs between runs (buffer_load with an SGPR
; resource descriptor, flat_load, or global_load).
6094 define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
6095 ; GCN-LABEL: global_extload_v8bf16_to_v8f32:
6097 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6098 ; GCN-NEXT: s_mov_b32 s6, 0
6099 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6100 ; GCN-NEXT: s_mov_b32 s4, s6
6101 ; GCN-NEXT: s_mov_b32 s5, s6
6102 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6103 ; GCN-NEXT: s_waitcnt vmcnt(0)
6104 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6105 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6106 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6107 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6108 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6109 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6110 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6111 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6112 ; GCN-NEXT: s_setpc_b64 s[30:31]
6114 ; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
6116 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6117 ; GFX7-NEXT: s_mov_b32 s6, 0
6118 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6119 ; GFX7-NEXT: s_mov_b32 s4, s6
6120 ; GFX7-NEXT: s_mov_b32 s5, s6
6121 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6122 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6123 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6124 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6125 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6126 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6127 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6128 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6129 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6130 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6131 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6133 ; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
6135 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6136 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6137 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6138 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6139 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6140 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6141 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6142 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6143 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6144 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6145 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6146 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6148 ; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
6150 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6151 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6152 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6153 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6154 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6155 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6156 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6157 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6158 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6159 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6160 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6161 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6163 ; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
6165 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6166 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6167 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6168 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6169 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6170 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6171 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6172 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6173 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6174 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6175 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6176 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6178 ; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
6180 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6181 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6182 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6183 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6184 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6185 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6186 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6187 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6188 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6189 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6190 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6191 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a plain vector load followed by a vector fpext.
6192 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
6193 %fpext = fpext <8 x bfloat> %load to <8 x float>
6194 ret <8 x float> %fpext
; Extending load of a 256-bit vector: reads <16 x bfloat> from global memory
; (addrspace(1)) and widens every lane to float with fpext.
; Every run expects two 128-bit loads, at byte offsets 0 and 16, into v[4:7]
; and v[12:15]. The first group of eight conversions (results v0..v7) follows
; s_waitcnt vmcnt(1) and the second group (results v8..v15) follows
; s_waitcnt vmcnt(0).
; Run-specific differences visible below: GFX8 materialises the +16 address
; with v_add_u32 / v_addc_u32 because its flat_load carries no offset operand
; here, and GFX10 / GFX11 prefix the two loads with s_clause 0x1.
6197 define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
6198 ; GCN-LABEL: global_extload_v16bf16_to_v16f32:
6200 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6201 ; GCN-NEXT: s_mov_b32 s6, 0
6202 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6203 ; GCN-NEXT: s_mov_b32 s4, s6
6204 ; GCN-NEXT: s_mov_b32 s5, s6
6205 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6206 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6207 ; GCN-NEXT: s_waitcnt vmcnt(1)
6208 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6209 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6210 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6211 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6212 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6213 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6214 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6215 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6216 ; GCN-NEXT: s_waitcnt vmcnt(0)
6217 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6218 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6219 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6220 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6221 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6222 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6223 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6224 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6225 ; GCN-NEXT: s_setpc_b64 s[30:31]
6227 ; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
6229 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6230 ; GFX7-NEXT: s_mov_b32 s6, 0
6231 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6232 ; GFX7-NEXT: s_mov_b32 s4, s6
6233 ; GFX7-NEXT: s_mov_b32 s5, s6
6234 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6235 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6236 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6237 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6238 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6239 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6240 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6241 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6242 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6243 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6244 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6245 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6246 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6247 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6248 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6249 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6250 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6251 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6252 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6253 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6254 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6256 ; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
6258 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6259 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6260 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6261 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6262 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
6263 ; GFX8-NEXT: s_waitcnt vmcnt(1)
6264 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6265 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6266 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6267 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6268 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6269 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6270 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6271 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6272 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6273 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6274 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6275 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6276 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6277 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6278 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6279 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6280 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6281 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6283 ; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
6285 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6286 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6287 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6288 ; GFX9-NEXT: s_waitcnt vmcnt(1)
6289 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6290 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6291 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6292 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6293 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6294 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6295 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6296 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6297 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6298 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6299 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6300 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6301 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6302 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6303 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6304 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6305 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6306 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6308 ; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
6310 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6311 ; GFX10-NEXT: s_clause 0x1
6312 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6313 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6314 ; GFX10-NEXT: s_waitcnt vmcnt(1)
6315 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6316 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6317 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6318 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6319 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6320 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6321 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6322 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6323 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6324 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6325 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6326 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6327 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6328 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6329 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6330 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6331 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6332 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6334 ; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
6336 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6337 ; GFX11-NEXT: s_clause 0x1
6338 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6339 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
6340 ; GFX11-NEXT: s_waitcnt vmcnt(1)
6341 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6342 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6343 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6344 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6345 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6346 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6347 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6348 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6349 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6350 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6351 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6352 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6353 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6354 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6355 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6356 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6357 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6358 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a plain vector load followed by a vector fpext.
6359 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
6360 %fpext = fpext <16 x bfloat> %load to <16 x float>
6361 ret <16 x float> %fpext
; Extending load of a 512-bit vector: reads <32 x bfloat> from global memory
; (addrspace(1)) and widens every lane to float with fpext.
; Every run expects four 128-bit loads, at byte offsets 0, 16, 32 and 48, into
; v[4:7], v[12:15], v[20:23] and v[28:31]. Each group of eight conversions is
; preceded by its own s_waitcnt, counting down from vmcnt(3) to vmcnt(0), and
; the 32 results fill v0..v31.
; Run-specific differences visible below: GFX8 materialises the +16, +32 and
; +48 addresses with v_add_u32 / v_addc_u32 pairs because its flat_load carries
; no offset operand here, and GFX10 / GFX11 prefix the four loads with
; s_clause 0x3.
6364 define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
6365 ; GCN-LABEL: global_extload_v32bf16_to_v32f32:
6367 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6368 ; GCN-NEXT: s_mov_b32 s6, 0
6369 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6370 ; GCN-NEXT: s_mov_b32 s4, s6
6371 ; GCN-NEXT: s_mov_b32 s5, s6
6372 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6373 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6374 ; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6375 ; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6376 ; GCN-NEXT: s_waitcnt vmcnt(3)
6377 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6378 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6379 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6380 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6381 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6382 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6383 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6384 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6385 ; GCN-NEXT: s_waitcnt vmcnt(2)
6386 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6387 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6388 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6389 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6390 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6391 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6392 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6393 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6394 ; GCN-NEXT: s_waitcnt vmcnt(1)
6395 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6396 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6397 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6398 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6399 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6400 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6401 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6402 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6403 ; GCN-NEXT: s_waitcnt vmcnt(0)
6404 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6405 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6406 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6407 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6408 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6409 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6410 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6411 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6412 ; GCN-NEXT: s_setpc_b64 s[30:31]
6414 ; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
6416 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6417 ; GFX7-NEXT: s_mov_b32 s6, 0
6418 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6419 ; GFX7-NEXT: s_mov_b32 s4, s6
6420 ; GFX7-NEXT: s_mov_b32 s5, s6
6421 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6422 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6423 ; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6424 ; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6425 ; GFX7-NEXT: s_waitcnt vmcnt(3)
6426 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6427 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6428 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6429 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6430 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6431 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6432 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6433 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6434 ; GFX7-NEXT: s_waitcnt vmcnt(2)
6435 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6436 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6437 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6438 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6439 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6440 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6441 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6442 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6443 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6444 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6445 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6446 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6447 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6448 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6449 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6450 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6451 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6452 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6453 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6454 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6455 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6456 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6457 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6458 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6459 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6460 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6461 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6463 ; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
6465 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6466 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
6467 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
6468 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6469 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3]
6470 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
6471 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
6472 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
6473 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6474 ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3]
6475 ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
6476 ; GFX8-NEXT: s_waitcnt vmcnt(3)
6477 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6478 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6479 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6480 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6481 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6482 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6483 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6484 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6485 ; GFX8-NEXT: s_waitcnt vmcnt(2)
6486 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6487 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6488 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6489 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6490 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6491 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6492 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6493 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6494 ; GFX8-NEXT: s_waitcnt vmcnt(1)
6495 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6496 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6497 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6498 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6499 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6500 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6501 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6502 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6503 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6504 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6505 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6506 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6507 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6508 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6509 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6510 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6511 ; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6512 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6514 ; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
6516 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6517 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6518 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6519 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
6520 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
6521 ; GFX9-NEXT: s_waitcnt vmcnt(3)
6522 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6523 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6524 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6525 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6526 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6527 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6528 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6529 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6530 ; GFX9-NEXT: s_waitcnt vmcnt(2)
6531 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6532 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6533 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6534 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6535 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6536 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6537 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6538 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6539 ; GFX9-NEXT: s_waitcnt vmcnt(1)
6540 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6541 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6542 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6543 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6544 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6545 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6546 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6547 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6549 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6550 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6551 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6552 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6553 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6554 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6555 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6556 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6557 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6559 ; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
6561 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6562 ; GFX10-NEXT: s_clause 0x3
6563 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6564 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6565 ; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
6566 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
6567 ; GFX10-NEXT: s_waitcnt vmcnt(3)
6568 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6569 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6570 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6571 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6572 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6573 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6574 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6575 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6576 ; GFX10-NEXT: s_waitcnt vmcnt(2)
6577 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6578 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6579 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6580 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6581 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6582 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6583 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6584 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6585 ; GFX10-NEXT: s_waitcnt vmcnt(1)
6586 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6587 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6588 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6589 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6590 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6591 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6592 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6593 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6594 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6595 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6596 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6597 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6598 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6599 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6600 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6601 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6602 ; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6603 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6605 ; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
6607 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6608 ; GFX11-NEXT: s_clause 0x3
6609 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6610 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
6611 ; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32
6612 ; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48
6613 ; GFX11-NEXT: s_waitcnt vmcnt(3)
6614 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6615 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6616 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6617 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6618 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6619 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6620 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6621 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6622 ; GFX11-NEXT: s_waitcnt vmcnt(2)
6623 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6624 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6625 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6626 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6627 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6628 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6629 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6630 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6631 ; GFX11-NEXT: s_waitcnt vmcnt(1)
6632 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6633 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6634 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6635 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6636 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6637 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6638 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6639 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6640 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6641 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6642 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6643 ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6644 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6645 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6646 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6647 ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6648 ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6649 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a plain vector load followed by a vector fpext.
6650 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
6651 %fpext = fpext <32 x bfloat> %load to <32 x float>
6652 ret <32 x float> %fpext
; Loads <2 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <2 x double>. In the expected code for every check prefix, one 32-bit load
; fetches both elements; each bfloat is then widened to f32 by placing it in the
; upper 16 bits of a register (shift left by 16 for the low half, mask with
; 0xffff0000 for the high half) and converted with v_cvt_f64_f32.
; The check lines are autogenerated (see the note at the top of the file).
6655 define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
6656 ; GCN-LABEL: global_extload_v2bf16_to_v2f64:
6658 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6659 ; GCN-NEXT: s_mov_b32 s6, 0
6660 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6661 ; GCN-NEXT: s_mov_b32 s4, s6
6662 ; GCN-NEXT: s_mov_b32 s5, s6
6663 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
6664 ; GCN-NEXT: s_waitcnt vmcnt(0)
6665 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6666 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6667 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6668 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6669 ; GCN-NEXT: s_setpc_b64 s[30:31]
6671 ; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
6673 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6674 ; GFX7-NEXT: s_mov_b32 s6, 0
6675 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6676 ; GFX7-NEXT: s_mov_b32 s4, s6
6677 ; GFX7-NEXT: s_mov_b32 s5, s6
6678 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
6679 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6680 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6681 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6682 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6683 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6684 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6686 ; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
6688 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6689 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
6690 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6691 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6692 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6693 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6694 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6695 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6697 ; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
6699 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6700 ; GFX9-NEXT: global_load_dword v2, v[0:1], off
6701 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6702 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6703 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6704 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6705 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6706 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6708 ; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
6710 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6711 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
6712 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6713 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6714 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6715 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6716 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6717 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6719 ; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
6721 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6722 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
6723 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6724 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6725 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6727 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6728 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6729 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6730 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
6731 %fpext = fpext <2 x bfloat> %load to <2 x double>
6732 ret <2 x double> %fpext
; Loads <3 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <3 x double>. The expected code for every check prefix issues one 64-bit load
; and converts only three 16-bit halves: the low and high halves of the first
; dword and the low half of the second. Each half is widened to f32 in the upper
; 16 bits of a register (shift left by 16, or mask with 0xffff0000) before
; v_cvt_f64_f32.
; The check lines are autogenerated (see the note at the top of the file).
6735 define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
6736 ; GCN-LABEL: global_extload_v3bf16_to_v3f64:
6738 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6739 ; GCN-NEXT: s_mov_b32 s6, 0
6740 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6741 ; GCN-NEXT: s_mov_b32 s4, s6
6742 ; GCN-NEXT: s_mov_b32 s5, s6
6743 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6744 ; GCN-NEXT: s_waitcnt vmcnt(0)
6745 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6746 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6747 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6748 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6749 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6750 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6751 ; GCN-NEXT: s_setpc_b64 s[30:31]
6753 ; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
6755 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6756 ; GFX7-NEXT: s_mov_b32 s6, 0
6757 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6758 ; GFX7-NEXT: s_mov_b32 s4, s6
6759 ; GFX7-NEXT: s_mov_b32 s5, s6
6760 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
6761 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6762 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6763 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6764 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6765 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6766 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6767 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6768 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6770 ; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
6772 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6773 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
6774 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6775 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6776 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6777 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6778 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6779 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6780 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6781 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6783 ; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
6785 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6786 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
6787 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6788 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6789 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6790 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6791 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6792 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6793 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6794 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6796 ; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
6798 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6799 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
6800 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6801 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6802 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6803 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6804 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6805 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6806 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6807 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6809 ; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
6811 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6812 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
6813 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6814 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6815 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6816 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6817 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6818 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6819 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6820 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
6821 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6822 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6823 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
6824 %fpext = fpext <3 x bfloat> %load to <3 x double>
6825 ret <3 x double> %fpext
; Loads <4 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <4 x double>. The expected code for every check prefix issues one 64-bit load
; and converts all four 16-bit halves: each is widened to f32 in the upper
; 16 bits of a register (shift left by 16 for low halves, mask with 0xffff0000
; for high halves) and then passed to v_cvt_f64_f32.
; The check lines are autogenerated (see the note at the top of the file).
6828 define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
6829 ; GCN-LABEL: global_extload_v4bf16_to_v4f64:
6831 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6832 ; GCN-NEXT: s_mov_b32 s6, 0
6833 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6834 ; GCN-NEXT: s_mov_b32 s4, s6
6835 ; GCN-NEXT: s_mov_b32 s5, s6
6836 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6837 ; GCN-NEXT: s_waitcnt vmcnt(0)
6838 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6839 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6840 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6841 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6842 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6843 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6844 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6845 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6846 ; GCN-NEXT: s_setpc_b64 s[30:31]
6848 ; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
6850 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6851 ; GFX7-NEXT: s_mov_b32 s6, 0
6852 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6853 ; GFX7-NEXT: s_mov_b32 s4, s6
6854 ; GFX7-NEXT: s_mov_b32 s5, s6
6855 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6856 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6857 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6858 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6859 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6860 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6861 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6862 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6863 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6864 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6865 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6867 ; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
6869 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6870 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6871 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6872 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6873 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6874 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6875 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6876 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6877 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6878 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6879 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6880 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6882 ; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
6884 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6885 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
6886 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6887 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6888 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6889 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6890 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6891 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6892 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6893 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6894 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6895 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6897 ; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
6899 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6900 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
6901 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6902 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6903 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6904 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6905 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
6906 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6907 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6908 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6909 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6910 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6912 ; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
6914 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6915 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
6916 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6917 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6918 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6919 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6920 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
6921 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6922 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6923 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6924 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6925 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6926 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6927 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6928 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
6929 %fpext = fpext <4 x bfloat> %load to <4 x double>
6930 ret <4 x double> %fpext
; Loads <5 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <5 x double>. The expected load sequence differs by target: the GCN and GFX7
; checks expect a 16-bit load at offset 8 plus a 64-bit load, while the GFX8
; through GFX11 checks expect a single 128-bit load. In all cases five 16-bit
; values are widened to f32 in the upper 16 bits of a register (shift left by
; 16, or mask with 0xffff0000) and converted with v_cvt_f64_f32.
; The check lines are autogenerated (see the note at the top of the file).
6933 define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
6934 ; GCN-LABEL: global_extload_v5bf16_to_v5f64:
6936 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6937 ; GCN-NEXT: s_mov_b32 s6, 0
6938 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6939 ; GCN-NEXT: s_mov_b32 s4, s6
6940 ; GCN-NEXT: s_mov_b32 s5, s6
6941 ; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6942 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6943 ; GCN-NEXT: s_waitcnt vmcnt(1)
6944 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
6945 ; GCN-NEXT: s_waitcnt vmcnt(0)
6946 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6947 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6948 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6949 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6950 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
6951 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6952 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6953 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6954 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6955 ; GCN-NEXT: s_setpc_b64 s[30:31]
6957 ; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
6959 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6960 ; GFX7-NEXT: s_mov_b32 s6, 0
6961 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6962 ; GFX7-NEXT: s_mov_b32 s4, s6
6963 ; GFX7-NEXT: s_mov_b32 s5, s6
6964 ; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6965 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6966 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6967 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
6968 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6969 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6970 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6971 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6972 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6973 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
6974 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6975 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6976 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6977 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6978 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6980 ; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
6982 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6983 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
6984 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6985 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6986 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6987 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6988 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6989 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
6990 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6991 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6992 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6993 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6994 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
6995 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6997 ; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
6999 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7000 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
7001 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7002 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7003 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7004 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7005 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7006 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7007 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7008 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7009 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7010 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7011 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7012 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7014 ; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
7016 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7017 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7018 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7019 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7020 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7021 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
7022 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
7023 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7024 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7025 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7026 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7027 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7028 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7029 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7031 ; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
7033 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7034 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
7035 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7036 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7037 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7038 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3
7039 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
7040 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7041 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7042 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7043 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7044 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7045 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7046 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7047 %load = load <5 x bfloat>, ptr addrspace(1) %ptr
7048 %fpext = fpext <5 x bfloat> %load to <5 x double>
7049 ret <5 x double> %fpext
; Loads <6 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <6 x double>. The GCN checks expect a 128-bit load (buffer_load_dwordx4); all
; other check prefixes expect a 96-bit load (dwordx3 / b96). Six 16-bit halves
; from the first three dwords are widened to f32 in the upper 16 bits of a
; register (shift left by 16, or mask with 0xffff0000) and converted with
; v_cvt_f64_f32.
; The check lines are autogenerated (see the note at the top of the file).
7052 define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
7053 ; GCN-LABEL: global_extload_v6bf16_to_v6f64:
7055 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7056 ; GCN-NEXT: s_mov_b32 s6, 0
7057 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7058 ; GCN-NEXT: s_mov_b32 s4, s6
7059 ; GCN-NEXT: s_mov_b32 s5, s6
7060 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7061 ; GCN-NEXT: s_waitcnt vmcnt(0)
7062 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7063 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7064 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7065 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7066 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7067 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7068 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7069 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7070 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7071 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7072 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7073 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7074 ; GCN-NEXT: s_setpc_b64 s[30:31]
7076 ; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
7078 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7079 ; GFX7-NEXT: s_mov_b32 s6, 0
7080 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7081 ; GFX7-NEXT: s_mov_b32 s4, s6
7082 ; GFX7-NEXT: s_mov_b32 s5, s6
7083 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
7084 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7085 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7086 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7087 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7088 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7089 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7090 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7091 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7092 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7093 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7094 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7095 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7096 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7097 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7099 ; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
7101 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7102 ; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
7103 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7104 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7105 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7106 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7107 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7108 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7109 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7110 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7111 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7112 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7113 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7114 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7115 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7116 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7118 ; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
7120 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7121 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
7122 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7123 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7124 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7125 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7126 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7127 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7128 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7129 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7130 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7131 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7132 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7133 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7134 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7135 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7137 ; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
7139 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7140 ; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
7141 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7142 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
7143 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
7144 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
7145 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
7146 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
7147 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
7148 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7149 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7150 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7151 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7152 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7153 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7154 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7156 ; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
7158 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7159 ; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off
7160 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7161 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
7162 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
7163 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
7164 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
7165 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
7166 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
7167 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7168 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7169 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7170 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7171 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7172 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7173 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7174 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
7175 %fpext = fpext <6 x bfloat> %load to <6 x double>
7176 ret <6 x double> %fpext
; Loads <8 x bfloat> from a global (addrspace(1)) pointer and extends it to
; <8 x double>. The expected code for every check prefix issues one 128-bit
; load and converts all eight 16-bit halves: each is widened to f32 in the upper
; 16 bits of a register (shift left by 16 for low halves, mask with 0xffff0000
; for high halves) and then passed to v_cvt_f64_f32, filling v[0:15].
; The check lines are autogenerated (see the note at the top of the file).
7179 define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
7180 ; GCN-LABEL: global_extload_v8bf16_to_v8f64:
7182 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7183 ; GCN-NEXT: s_mov_b32 s6, 0
7184 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7185 ; GCN-NEXT: s_mov_b32 s4, s6
7186 ; GCN-NEXT: s_mov_b32 s5, s6
7187 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7188 ; GCN-NEXT: s_waitcnt vmcnt(0)
7189 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7190 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7191 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7192 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7193 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7194 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7195 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7196 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7197 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7198 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7199 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7200 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7201 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7202 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7203 ; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7204 ; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7205 ; GCN-NEXT: s_setpc_b64 s[30:31]
7207 ; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
7209 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7210 ; GFX7-NEXT: s_mov_b32 s6, 0
7211 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7212 ; GFX7-NEXT: s_mov_b32 s4, s6
7213 ; GFX7-NEXT: s_mov_b32 s5, s6
7214 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7215 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7216 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7217 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7218 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7219 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7220 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7221 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7222 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7223 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7224 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7225 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7226 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7227 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7228 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7229 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7230 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7231 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7232 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7234 ; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
7236 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7237 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
7238 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7239 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7240 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7241 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7242 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7243 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7244 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7245 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7246 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7247 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7248 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7249 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7250 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7251 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7252 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7253 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7254 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7255 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7257 ; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
7259 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7260 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
7261 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7262 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7263 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7264 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7265 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7266 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7267 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7268 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7269 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7270 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7271 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7272 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7273 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7274 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7275 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7276 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7277 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7278 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7280 ; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
7282 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7283 ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
7284 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7285 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7286 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7287 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7288 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7289 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7290 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7291 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7292 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7293 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7294 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7295 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7296 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7297 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7298 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7299 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7300 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7301 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7303 ; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
7305 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7306 ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
7307 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7308 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7309 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7310 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7311 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7312 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7313 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7314 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7315 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7316 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7317 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7318 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7319 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7320 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7321 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7322 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7323 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7324 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7325 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
7326 %fpext = fpext <8 x bfloat> %load to <8 x double>
7327 ret <8 x double> %fpext
7330 define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
7331 ; GCN-LABEL: global_extload_v16bf16_to_v16f64:
7333 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7334 ; GCN-NEXT: s_mov_b32 s6, 0
7335 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7336 ; GCN-NEXT: s_mov_b32 s4, s6
7337 ; GCN-NEXT: s_mov_b32 s5, s6
7338 ; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7339 ; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7340 ; GCN-NEXT: s_waitcnt vmcnt(1)
7341 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7342 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7343 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7344 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7345 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7346 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7347 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7348 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7349 ; GCN-NEXT: s_waitcnt vmcnt(0)
7350 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7351 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7352 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7353 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7354 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7355 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7356 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7357 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7358 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7359 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7360 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7361 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7362 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7363 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7364 ; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7365 ; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7366 ; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7367 ; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7368 ; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7369 ; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7370 ; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7371 ; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7372 ; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7373 ; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7374 ; GCN-NEXT: s_setpc_b64 s[30:31]
7376 ; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
7378 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7379 ; GFX7-NEXT: s_mov_b32 s6, 0
7380 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7381 ; GFX7-NEXT: s_mov_b32 s4, s6
7382 ; GFX7-NEXT: s_mov_b32 s5, s6
7383 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7384 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7385 ; GFX7-NEXT: s_waitcnt vmcnt(1)
7386 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7387 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7388 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7389 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7390 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7391 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7392 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7393 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7394 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7395 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7396 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7397 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7398 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7399 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7400 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7401 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7402 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7403 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7404 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7405 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7406 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7407 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7408 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7409 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7410 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7411 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7412 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7413 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7414 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7415 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7416 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7417 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7418 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7419 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7421 ; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
7423 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424 ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
7425 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7426 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7427 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
7428 ; GFX8-NEXT: s_waitcnt vmcnt(1)
7429 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7430 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7431 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7432 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7433 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7434 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7435 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7436 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7437 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7438 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7439 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7440 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7441 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7442 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7443 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7444 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7445 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7446 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7447 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7448 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7449 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7450 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7451 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7452 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7453 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7454 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7455 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7456 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7457 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7458 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7459 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7460 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7461 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7462 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7464 ; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
7466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7467 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7468 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
7469 ; GFX9-NEXT: s_waitcnt vmcnt(1)
7470 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7471 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7472 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7473 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7474 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7475 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7476 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7477 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7478 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7479 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7480 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7481 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7482 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7483 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7484 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7485 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7486 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7487 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7488 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7489 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7490 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7491 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7492 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7493 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7494 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7495 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7496 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7497 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7498 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7499 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7500 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7501 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7502 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7503 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7505 ; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
7507 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7508 ; GFX10-NEXT: s_clause 0x1
7509 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7510 ; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16
7511 ; GFX10-NEXT: s_waitcnt vmcnt(1)
7512 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7513 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7514 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
7515 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
7516 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7517 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7518 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7519 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7520 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7521 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9
7522 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
7523 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10
7524 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10
7525 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11
7526 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11
7527 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
7528 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12
7529 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7530 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7531 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7532 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7533 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7534 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7535 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7536 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7537 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7538 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7539 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7540 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7541 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7542 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7543 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7544 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7545 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7547 ; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
7549 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7550 ; GFX11-NEXT: s_clause 0x1
7551 ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
7552 ; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16
7553 ; GFX11-NEXT: s_waitcnt vmcnt(1)
7554 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7555 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7556 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7557 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7558 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7559 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7560 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7561 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7562 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7563 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23
7564 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
7565 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24
7566 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
7567 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
7568 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
7569 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26
7570 ; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26
7571 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7572 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7573 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7574 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7575 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7576 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7577 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7578 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7579 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7580 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7581 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7582 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7583 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7584 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27
7585 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7586 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7587 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7588 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
7589 %fpext = fpext <16 x bfloat> %load to <16 x double>
7590 ret <16 x double> %fpext
7593 define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
7594 ; GCN-LABEL: global_extload_v32bf16_to_v32f64:
7596 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7597 ; GCN-NEXT: s_mov_b32 s6, 0
7598 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7599 ; GCN-NEXT: s_mov_b32 s4, s6
7600 ; GCN-NEXT: s_mov_b32 s5, s6
7601 ; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
7602 ; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
7603 ; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
7604 ; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
7605 ; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
7606 ; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
7607 ; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
7608 ; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
7609 ; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
7610 ; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
7611 ; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
7612 ; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
7613 ; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
7614 ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
7615 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
7616 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
7617 ; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
7618 ; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
7619 ; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
7620 ; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
7621 ; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
7622 ; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
7623 ; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
7624 ; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
7625 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
7626 ; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
7627 ; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
7628 ; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
7629 ; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
7630 ; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
7631 ; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
7632 ; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
7633 ; GCN-NEXT: s_waitcnt vmcnt(8)
7634 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
7635 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0
7636 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7637 ; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
7638 ; GCN-NEXT: s_waitcnt expcnt(0)
7639 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
7640 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7641 ; GCN-NEXT: s_waitcnt expcnt(0)
7642 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
7643 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0
7644 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7645 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7646 ; GCN-NEXT: s_waitcnt expcnt(0)
7647 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
7648 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7649 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0
7650 ; GCN-NEXT: s_waitcnt expcnt(0)
7651 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
7652 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7653 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7654 ; GCN-NEXT: s_waitcnt expcnt(0)
7655 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
7656 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7657 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0
7658 ; GCN-NEXT: s_waitcnt expcnt(0)
7659 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
7660 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7661 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7662 ; GCN-NEXT: s_waitcnt expcnt(0)
7663 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
7664 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7665 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0
7666 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0
7667 ; GCN-NEXT: s_waitcnt expcnt(0)
7668 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
7669 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7670 ; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
7671 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0
7672 ; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
7673 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0
7674 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0
7675 ; GCN-NEXT: s_waitcnt expcnt(0)
7676 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
7677 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7678 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7679 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0
7680 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7681 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0
7682 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0
7683 ; GCN-NEXT: s_waitcnt expcnt(0)
7684 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
7685 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7686 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7687 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0
7688 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
7689 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0
7690 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0
7691 ; GCN-NEXT: s_waitcnt expcnt(0)
7692 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
7693 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7694 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7695 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0
7696 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7697 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0
7698 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0
7699 ; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
7700 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
7701 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7702 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
7703 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0
7704 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
7705 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0
7706 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0
7707 ; GCN-NEXT: s_waitcnt expcnt(0)
7708 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
7709 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7710 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7711 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0
7712 ; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
7713 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0
7714 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0
7715 ; GCN-NEXT: s_waitcnt expcnt(0)
7716 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
7717 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7718 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7719 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0
7720 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7721 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0
7722 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0
7723 ; GCN-NEXT: s_waitcnt expcnt(0)
7724 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
7725 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7726 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
7727 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0
7728 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
7729 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0
7730 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
7731 ; GCN-NEXT: s_waitcnt expcnt(0)
7732 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
7733 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7734 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7735 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
7736 ; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
7737 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0
7738 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0
7739 ; GCN-NEXT: s_waitcnt expcnt(0)
7740 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
7741 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7742 ; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
7743 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
7744 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7745 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0
7746 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0
7747 ; GCN-NEXT: s_waitcnt expcnt(0)
7748 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
7749 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7750 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7751 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
7752 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7753 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0
7754 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0
7755 ; GCN-NEXT: s_waitcnt expcnt(0)
7756 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
7757 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7758 ; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
7759 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
7760 ; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
7761 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0
7762 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
7763 ; GCN-NEXT: s_waitcnt expcnt(0)
7764 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
7765 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7766 ; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
7767 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
7768 ; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
7769 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
7770 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
7771 ; GCN-NEXT: s_waitcnt expcnt(0)
7772 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
7773 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7774 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
7775 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
7776 ; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
7777 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
7778 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0
7779 ; GCN-NEXT: s_waitcnt expcnt(0)
7780 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
7781 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7782 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7783 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0
7784 ; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
7785 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
7786 ; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
7787 ; GCN-NEXT: s_waitcnt expcnt(0)
7788 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
7789 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7790 ; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
7791 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0
7792 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7793 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
7794 ; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
7795 ; GCN-NEXT: s_waitcnt expcnt(0)
7796 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
7797 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7798 ; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
7799 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
7800 ; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen
7801 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0
7802 ; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
7803 ; GCN-NEXT: s_waitcnt expcnt(0)
7804 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
7805 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7806 ; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
7807 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0
7808 ; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
7809 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
7810 ; GCN-NEXT: s_waitcnt expcnt(0)
7811 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12
7812 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
7813 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
7814 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
7815 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7816 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4
7817 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7818 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6
7819 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
7820 ; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
7821 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7822 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
7823 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
7824 ; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
7825 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
7826 ; GCN-NEXT: s_waitcnt expcnt(0)
7827 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
7828 ; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
7829 ; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
7830 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
7831 ; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13
7832 ; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
7833 ; GCN-NEXT: s_waitcnt expcnt(0)
7834 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
7835 ; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
7836 ; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
7837 ; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
7838 ; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
7839 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
7840 ; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
7841 ; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen
7842 ; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen
7843 ; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
7844 ; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen
7845 ; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen
7846 ; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
7847 ; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
7848 ; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
7849 ; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
7850 ; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
7851 ; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
7852 ; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
7853 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7854 ; GCN-NEXT: s_setpc_b64 s[30:31]
7856 ; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
7858 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7859 ; GFX7-NEXT: s_mov_b32 s6, 0
7860 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7861 ; GFX7-NEXT: s_mov_b32 s4, s6
7862 ; GFX7-NEXT: s_mov_b32 s5, s6
7863 ; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
7864 ; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
7865 ; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
7866 ; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
7867 ; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
7868 ; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
7869 ; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
7870 ; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
7871 ; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
7872 ; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
7873 ; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
7874 ; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
7875 ; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
7876 ; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
7877 ; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
7878 ; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
7879 ; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
7880 ; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
7881 ; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
7882 ; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
7883 ; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
7884 ; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
7885 ; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
7886 ; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
7887 ; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
7888 ; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
7889 ; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
7890 ; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
7891 ; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
7892 ; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
7893 ; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
7894 ; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
7895 ; GFX7-NEXT: s_waitcnt vmcnt(14)
7896 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17
7897 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7898 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0
7899 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7900 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
7901 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7902 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18
7903 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7904 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0
7905 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0
7906 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7907 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
7908 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7909 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19
7910 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7911 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0
7912 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0
7913 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7914 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
7915 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7916 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20
7917 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7918 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0
7919 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0
7920 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7921 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
7922 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21
7923 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7924 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17
7925 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0
7926 ; GFX7-NEXT: s_waitcnt vmcnt(14)
7927 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
7928 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7929 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
7930 ; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
7931 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
7932 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23
7933 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
7934 ; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
7935 ; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen
7936 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0
7937 ; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
7938 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24
7939 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7940 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0
7941 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
7942 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0
7943 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7944 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31
7945 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
7946 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
7947 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7948 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0
7949 ; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
7950 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30
7951 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7952 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0
7953 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
7954 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0
7955 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7956 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29
7957 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
7958 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
7959 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7960 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0
7961 ; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
7962 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28
7963 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7964 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0
7965 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
7966 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0
7967 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7968 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
7969 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
7970 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
7971 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7972 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0
7973 ; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
7974 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26
7975 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7976 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0
7977 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
7978 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0
7979 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7980 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25
7981 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
7982 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
7983 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7984 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0
7985 ; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
7986 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0
7987 ; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
7988 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16
7989 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7990 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0
7991 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32
7992 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
7993 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0
7994 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7995 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16
7996 ; GFX7-NEXT: s_waitcnt vmcnt(14)
7997 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34
7998 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7999 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0
8000 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0
8001 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
8002 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
8003 ; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8004 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33
8005 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
8006 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
8007 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
8008 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
8009 ; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
8010 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0
8011 ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
8012 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13
8013 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
8014 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
8015 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0
8016 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8017 ; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen
8018 ; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0
8019 ; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
8020 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
8021 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
8022 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
8023 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0
8024 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
8025 ; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
8026 ; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0
8027 ; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
8028 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
8029 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
8030 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
8031 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0
8032 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
8033 ; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
8034 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0
8035 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7
8036 ; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
8037 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4
8038 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
8039 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
8040 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
8041 ; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
8042 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
8043 ; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen
8044 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0
8045 ; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen
8046 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
8047 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
8048 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8049 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10
8050 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
8051 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
8052 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
8053 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0
8054 ; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8055 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0
8056 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
8057 ; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen
8058 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0
8059 ; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen
8060 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
8061 ; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
8062 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
8063 ; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
8064 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
8065 ; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
8066 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
8067 ; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
8068 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
8069 ; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
8070 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
8071 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
8072 ; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen
8073 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
8074 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8075 ; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
8076 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
8077 ; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
8078 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
8079 ; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen
8080 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0
8081 ; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen
8082 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0
8083 ; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
8084 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0
8085 ; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
8086 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
8087 ; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8088 ; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
8089 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8090 ; GFX7-NEXT: s_setpc_b64 s[30:31]
8092 ; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
8094 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8095 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
8096 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
8097 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
8098 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
8099 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1
8100 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
8101 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1
8102 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
8103 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
8104 ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
8105 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
8106 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
8107 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
8108 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
8109 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
8110 ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
8111 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
8112 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
8113 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
8114 ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
8115 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
8116 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
8117 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
8118 ; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc
8119 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1
8120 ; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
8121 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1
8122 ; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
8123 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
8124 ; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
8125 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
8126 ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
8127 ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
8128 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
8129 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
8130 ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
8131 ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
8132 ; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
8133 ; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
8134 ; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
8135 ; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
8136 ; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
8137 ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
8138 ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
8139 ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
8140 ; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
8141 ; GFX8-NEXT: flat_load_ushort v44, v[1:2]
8142 ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
8143 ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
8144 ; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
8145 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
8146 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8147 ; GFX8-NEXT: flat_load_ushort v45, v[50:51]
8148 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
8149 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8150 ; GFX8-NEXT: flat_load_ushort v46, v[50:51]
8151 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
8152 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8153 ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
8154 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
8155 ; GFX8-NEXT: flat_load_ushort v47, v[52:53]
8156 ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
8157 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
8158 ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
8159 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
8160 ; GFX8-NEXT: flat_load_ushort v56, v[54:55]
8161 ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
8162 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
8163 ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
8164 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
8165 ; GFX8-NEXT: flat_load_ushort v57, v[39:40]
8166 ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
8167 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
8168 ; GFX8-NEXT: flat_load_ushort v58, v[39:40]
8169 ; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
8170 ; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
8171 ; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
8172 ; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
8173 ; GFX8-NEXT: flat_load_ushort v42, v[42:43]
8174 ; GFX8-NEXT: flat_load_ushort v34, v[33:34]
8175 ; GFX8-NEXT: flat_load_ushort v36, v[35:36]
8176 ; GFX8-NEXT: flat_load_ushort v38, v[37:38]
8177 ; GFX8-NEXT: flat_load_ushort v39, v[48:49]
8178 ; GFX8-NEXT: flat_load_ushort v48, v[50:51]
8179 ; GFX8-NEXT: flat_load_ushort v51, v[52:53]
8180 ; GFX8-NEXT: flat_load_ushort v52, v[54:55]
8181 ; GFX8-NEXT: flat_load_ushort v53, v[40:41]
8182 ; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
8183 ; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
8184 ; GFX8-NEXT: flat_load_ushort v37, v[3:4]
8185 ; GFX8-NEXT: flat_load_ushort v35, v[5:6]
8186 ; GFX8-NEXT: flat_load_ushort v33, v[7:8]
8187 ; GFX8-NEXT: flat_load_ushort v8, v[9:10]
8188 ; GFX8-NEXT: flat_load_ushort v6, v[11:12]
8189 ; GFX8-NEXT: flat_load_ushort v4, v[13:14]
8190 ; GFX8-NEXT: flat_load_ushort v2, v[15:16]
8191 ; GFX8-NEXT: flat_load_ushort v1, v[19:20]
8192 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
8193 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
8194 ; GFX8-NEXT: s_waitcnt vmcnt(14)
8195 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
8196 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
8197 ; GFX8-NEXT: flat_load_ushort v3, v[17:18]
8198 ; GFX8-NEXT: flat_load_ushort v5, v[21:22]
8199 ; GFX8-NEXT: flat_load_ushort v7, v[23:24]
8200 ; GFX8-NEXT: flat_load_ushort v9, v[25:26]
8201 ; GFX8-NEXT: flat_load_ushort v10, v[27:28]
8202 ; GFX8-NEXT: flat_load_ushort v11, v[29:30]
8203 ; GFX8-NEXT: flat_load_ushort v12, v[31:32]
8204 ; GFX8-NEXT: flat_load_ushort v13, v[49:50]
8205 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
8206 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8207 ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
8208 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
8209 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
8210 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8211 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8212 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
8213 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8214 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
8215 ; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
8216 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0
8217 ; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
8218 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47
8219 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8220 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0
8221 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8222 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
8223 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8224 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
8225 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
8226 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8227 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
8228 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
8229 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8230 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
8231 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
8232 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8233 ; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
8234 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
8235 ; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
8236 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
8237 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8238 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
8239 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8240 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
8241 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8242 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
8243 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8244 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
8245 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
8246 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
8247 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8248 ; GFX8-NEXT: s_waitcnt vmcnt(14)
8249 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
8250 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8251 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
8252 ; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
8253 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
8254 ; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
8255 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
8256 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8257 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
8258 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8259 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
8260 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8261 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
8262 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8263 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
8264 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
8265 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
8266 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8267 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
8268 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8269 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
8270 ; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
8271 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
8272 ; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
8273 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
8274 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8275 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
8276 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8277 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
8278 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8279 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
8280 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
8281 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
8282 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
8283 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
8284 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8285 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
8286 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8287 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
8288 ; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
8289 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0
8290 ; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
8291 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
8292 ; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
8293 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
8294 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8295 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
8296 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
8297 ; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
8298 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
8299 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
8300 ; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
8301 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
8302 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
8303 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
8304 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
8305 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
8306 ; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
8307 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0
8308 ; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
8309 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
8310 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
8311 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
8312 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
8313 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
8314 ; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
8315 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
8316 ; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
8317 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
8318 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
8319 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
8320 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
8321 ; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
8322 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
8323 ; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
8324 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8
8325 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
8326 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
8327 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
8328 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
8329 ; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
8330 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
8331 ; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
8332 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6
8333 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
8334 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
8335 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
8336 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8337 ; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
8338 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
8339 ; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
8340 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4
8341 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
8342 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
8343 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
8344 ; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
8345 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
8346 ; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
8347 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
8348 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
8349 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
8350 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8351 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
8352 ; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
8353 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
8354 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8355 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
8356 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4
8357 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
8358 ; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
8359 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
8360 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8361 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0
8362 ; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
8363 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0
8364 ; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8365 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0
8366 ; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
8367 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0
8368 ; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
8369 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0
8370 ; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
8371 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0
8372 ; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
8373 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0
8374 ; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
8375 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0
8376 ; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
8377 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0
8378 ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
8379 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0
8380 ; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
8381 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0
8382 ; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
8383 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0
8384 ; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
8385 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0
8386 ; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
8387 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0
8388 ; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
8389 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0
8390 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
8391 ; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
8392 ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
8393 ; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
8394 ; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
8395 ; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
8396 ; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
8397 ; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
8398 ; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
8399 ; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
8400 ; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
8401 ; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
8402 ; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
8403 ; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
8404 ; GFX8-NEXT: s_waitcnt vmcnt(0)
8405 ; GFX8-NEXT: s_setpc_b64 s[30:31]
8407 ; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
8409 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8410 ; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62
8411 ; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60
8412 ; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58
8413 ; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56
8414 ; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54
8415 ; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52
8416 ; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50
8417 ; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48
8418 ; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46
8419 ; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44
8420 ; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42
8421 ; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40
8422 ; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38
8423 ; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36
8424 ; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34
8425 ; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32
8426 ; GFX9-NEXT: global_load_ushort v25, v[1:2], off
8427 ; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2
8428 ; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30
8429 ; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
8430 ; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
8431 ; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
8432 ; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
8433 ; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24
8434 ; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
8435 ; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
8436 ; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
8437 ; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
8438 ; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
8439 ; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
8440 ; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
8441 ; GFX9-NEXT: s_nop 0
8442 ; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
8443 ; GFX9-NEXT: s_waitcnt vmcnt(31)
8444 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8
8445 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8446 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8447 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
8448 ; GFX9-NEXT: s_waitcnt vmcnt(28)
8449 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12
8450 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252
8451 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248
8452 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8453 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
8454 ; GFX9-NEXT: s_waitcnt vmcnt(29)
8455 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
8456 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244
8457 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240
8458 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8459 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8460 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
8461 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
8462 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
8463 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
8464 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
8465 ; GFX9-NEXT: s_waitcnt vmcnt(31)
8466 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
8467 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8468 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
8469 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228
8470 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224
8471 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
8472 ; GFX9-NEXT: s_waitcnt vmcnt(31)
8473 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
8474 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
8475 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220
8476 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216
8477 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
8478 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
8479 ; GFX9-NEXT: s_waitcnt vmcnt(32)
8480 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18
8481 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8482 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20
8483 ; GFX9-NEXT: s_waitcnt vmcnt(28)
8484 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
8485 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212
8486 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208
8487 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8488 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19
8489 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
8490 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204
8491 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200
8492 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196
8493 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192
8494 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20
8495 ; GFX9-NEXT: s_waitcnt vmcnt(33)
8496 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
8497 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8498 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
8499 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19
8500 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188
8501 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184
8502 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180
8503 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176
8504 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172
8505 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168
8506 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
8507 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
8508 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156
8509 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152
8510 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148
8511 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8512 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144
8513 ; GFX9-NEXT: s_waitcnt vmcnt(44)
8514 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24
8515 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140
8516 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136
8517 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
8518 ; GFX9-NEXT: s_waitcnt vmcnt(43)
8519 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27
8520 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132
8521 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128
8522 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
8523 ; GFX9-NEXT: s_waitcnt vmcnt(38)
8524 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30
8525 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124
8526 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120
8527 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
8528 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29
8529 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116
8530 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112
8531 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16
8532 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25
8533 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
8534 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
8535 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2
8536 ; GFX9-NEXT: s_waitcnt vmcnt(41)
8537 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
8538 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28
8539 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
8540 ; GFX9-NEXT: s_waitcnt vmcnt(40)
8541 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
8542 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108
8543 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104
8544 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18
8545 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
8546 ; GFX9-NEXT: s_waitcnt vmcnt(41)
8547 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
8548 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
8549 ; GFX9-NEXT: s_waitcnt vmcnt(40)
8550 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
8551 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
8552 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
8553 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
8554 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
8555 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
8556 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
8557 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
8558 ; GFX9-NEXT: s_waitcnt vmcnt(41)
8559 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7
8560 ; GFX9-NEXT: s_waitcnt vmcnt(40)
8561 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
8562 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
8563 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8564 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
8565 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
8566 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8567 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
8568 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
8569 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
8570 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
8571 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
8572 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
8573 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22
8574 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
8575 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
8576 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
8577 ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
8578 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
8579 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
8580 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44
8581 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40
8582 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36
8583 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32
8584 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28
8585 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
8586 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
8587 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16
8588 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12
8589 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8
8590 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4
8591 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen
8592 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8593 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8595 ; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
8597 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8598 ; GFX10-NEXT: s_clause 0x1f
8599 ; GFX10-NEXT: global_load_ushort v3, v[1:2], off
8600 ; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2
8601 ; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4
8602 ; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6
8603 ; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8
8604 ; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10
8605 ; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12
8606 ; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14
8607 ; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16
8608 ; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18
8609 ; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20
8610 ; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22
8611 ; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24
8612 ; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
8613 ; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
8614 ; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
8615 ; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62
8616 ; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32
8617 ; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34
8618 ; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36
8619 ; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60
8620 ; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38
8621 ; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40
8622 ; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58
8623 ; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42
8624 ; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44
8625 ; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56
8626 ; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46
8627 ; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48
8628 ; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54
8629 ; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50
8630 ; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52
8631 ; GFX10-NEXT: s_waitcnt vmcnt(31)
8632 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
8633 ; GFX10-NEXT: s_waitcnt vmcnt(30)
8634 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
8635 ; GFX10-NEXT: s_waitcnt vmcnt(29)
8636 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5
8637 ; GFX10-NEXT: s_waitcnt vmcnt(28)
8638 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6
8639 ; GFX10-NEXT: s_waitcnt vmcnt(27)
8640 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7
8641 ; GFX10-NEXT: s_waitcnt vmcnt(26)
8642 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8
8643 ; GFX10-NEXT: s_waitcnt vmcnt(25)
8644 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
8645 ; GFX10-NEXT: s_waitcnt vmcnt(24)
8646 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
8647 ; GFX10-NEXT: s_waitcnt vmcnt(23)
8648 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11
8649 ; GFX10-NEXT: s_waitcnt vmcnt(22)
8650 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12
8651 ; GFX10-NEXT: s_waitcnt vmcnt(21)
8652 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
8653 ; GFX10-NEXT: s_waitcnt vmcnt(20)
8654 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14
8655 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35
8656 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36
8657 ; GFX10-NEXT: s_waitcnt vmcnt(17)
8658 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17
8659 ; GFX10-NEXT: s_waitcnt vmcnt(16)
8660 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18
8661 ; GFX10-NEXT: s_waitcnt vmcnt(15)
8662 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19
8663 ; GFX10-NEXT: s_waitcnt vmcnt(14)
8664 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
8665 ; GFX10-NEXT: s_waitcnt vmcnt(13)
8666 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21
8667 ; GFX10-NEXT: s_waitcnt vmcnt(12)
8668 ; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22
8669 ; GFX10-NEXT: s_waitcnt vmcnt(11)
8670 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23
8671 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8672 ; GFX10-NEXT: s_waitcnt vmcnt(9)
8673 ; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25
8674 ; GFX10-NEXT: s_waitcnt vmcnt(8)
8675 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26
8676 ; GFX10-NEXT: s_waitcnt vmcnt(7)
8677 ; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27
8678 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8679 ; GFX10-NEXT: s_waitcnt vmcnt(5)
8680 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29
8681 ; GFX10-NEXT: s_waitcnt vmcnt(4)
8682 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30
8683 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
8684 ; GFX10-NEXT: s_waitcnt vmcnt(2)
8685 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32
8686 ; GFX10-NEXT: s_waitcnt vmcnt(1)
8687 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33
8688 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
8689 ; GFX10-NEXT: s_waitcnt vmcnt(0)
8690 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34
8691 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31
8692 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
8693 ; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28
8694 ; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24
8695 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
8696 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71
8697 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68
8698 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16
8699 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70
8700 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15
8701 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
8702 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
8703 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23
8704 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37
8705 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38
8706 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
8707 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
8708 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25
8709 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66
8710 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
8711 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
8712 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27
8713 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48
8714 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
8715 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
8716 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81
8717 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49
8718 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
8719 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
8720 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80
8721 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
8722 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
8723 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69
8724 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64
8725 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50
8726 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51
8727 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54
8728 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
8729 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
8730 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67
8731 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39
8732 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
8733 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
8734 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65
8735 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
8736 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
8737 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55
8738 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
8739 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
8740 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53
8741 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
8742 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
8743 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52
8744 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
8745 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
8746 ; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
8747 ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
8748 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
8749 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
8750 ; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
8751 ; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
8752 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
8753 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
8754 ; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
8755 ; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
8756 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
8757 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
8758 ; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
8759 ; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
8760 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
8761 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
8762 ; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
8763 ; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
8764 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
8765 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
8766 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
8767 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
8768 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
8769 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
8770 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
8771 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
8772 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
8773 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
8774 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
8775 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
8776 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
8777 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
8778 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
8779 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
8780 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
8781 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
8782 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
8783 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
8784 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
8785 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen
8786 ; GFX10-NEXT: s_setpc_b64 s[30:31]
8788 ; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
8790 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8791 ; GFX11-NEXT: s_clause 0x1f
8792 ; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12
8793 ; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8
8794 ; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4
8795 ; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2
8796 ; GFX11-NEXT: global_load_u16 v7, v[1:2], off
8797 ; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6
8798 ; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10
8799 ; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14
8800 ; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28
8801 ; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24
8802 ; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20
8803 ; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18
8804 ; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16
8805 ; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22
8806 ; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26
8807 ; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30
8808 ; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44
8809 ; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40
8810 ; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36
8811 ; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34
8812 ; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32
8813 ; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38
8814 ; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42
8815 ; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46
8816 ; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60
8817 ; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56
8818 ; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52
8819 ; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50
8820 ; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48
8821 ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
8822 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
8823 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
8824 ; GFX11-NEXT: s_waitcnt vmcnt(31)
8825 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
8826 ; GFX11-NEXT: s_waitcnt vmcnt(30)
8827 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4
8828 ; GFX11-NEXT: s_waitcnt vmcnt(29)
8829 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
8830 ; GFX11-NEXT: s_waitcnt vmcnt(28)
8831 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6
8832 ; GFX11-NEXT: s_waitcnt vmcnt(27)
8833 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7
8834 ; GFX11-NEXT: s_waitcnt vmcnt(26)
8835 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
8836 ; GFX11-NEXT: s_waitcnt vmcnt(25)
8837 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
8838 ; GFX11-NEXT: s_waitcnt vmcnt(24)
8839 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
8840 ; GFX11-NEXT: s_waitcnt vmcnt(23)
8841 ; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11
8842 ; GFX11-NEXT: s_waitcnt vmcnt(22)
8843 ; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v12
8844 ; GFX11-NEXT: s_waitcnt vmcnt(21)
8845 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
8846 ; GFX11-NEXT: s_waitcnt vmcnt(20)
8847 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
8848 ; GFX11-NEXT: s_waitcnt vmcnt(19)
8849 ; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15
8850 ; GFX11-NEXT: s_waitcnt vmcnt(18)
8851 ; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16
8852 ; GFX11-NEXT: s_waitcnt vmcnt(17)
8853 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
8854 ; GFX11-NEXT: s_waitcnt vmcnt(16)
8855 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
8856 ; GFX11-NEXT: s_waitcnt vmcnt(15)
8857 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19
8858 ; GFX11-NEXT: s_waitcnt vmcnt(14)
8859 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20
8860 ; GFX11-NEXT: s_waitcnt vmcnt(13)
8861 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
8862 ; GFX11-NEXT: s_waitcnt vmcnt(12)
8863 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
8864 ; GFX11-NEXT: s_waitcnt vmcnt(11)
8865 ; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23
8866 ; GFX11-NEXT: s_waitcnt vmcnt(10)
8867 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24
8868 ; GFX11-NEXT: s_waitcnt vmcnt(9)
8869 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
8870 ; GFX11-NEXT: s_waitcnt vmcnt(8)
8871 ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
8872 ; GFX11-NEXT: s_waitcnt vmcnt(7)
8873 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27
8874 ; GFX11-NEXT: s_waitcnt vmcnt(6)
8875 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28
8876 ; GFX11-NEXT: s_waitcnt vmcnt(5)
8877 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
8878 ; GFX11-NEXT: s_waitcnt vmcnt(4)
8879 ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
8880 ; GFX11-NEXT: s_waitcnt vmcnt(3)
8881 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31
8882 ; GFX11-NEXT: s_waitcnt vmcnt(2)
8883 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32
8884 ; GFX11-NEXT: s_waitcnt vmcnt(1)
8885 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33
8886 ; GFX11-NEXT: s_waitcnt vmcnt(0)
8887 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8888 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68
8889 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v65
8890 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64
8891 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33
8892 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1
8893 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29
8894 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30
8895 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53
8896 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26
8897 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52
8898 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25
8899 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49
8900 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48
8901 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21
8902 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34
8903 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22
8904 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103
8905 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18
8906 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102
8907 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17
8908 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101
8909 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13
8910 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14
8911 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100
8912 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10
8913 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39
8914 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
8915 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38
8916 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6
8917 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
8918 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
8919 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
8920 ; GFX11-NEXT: s_clause 0xf
8921 ; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
8922 ; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
8923 ; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
8924 ; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
8925 ; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
8926 ; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
8927 ; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
8928 ; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
8929 ; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
8930 ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
8931 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
8932 ; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
8933 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
8934 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
8935 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
8936 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
8937 ; GFX11-NEXT: s_setpc_b64 s[30:31]
8938 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
8939 %fpext = fpext <32 x bfloat> %load to <32 x double>
8940 ret <32 x double> %fpext
; Scalar bfloat fadd: adds %a and %b and returns the bfloat sum.
; Checked output, by prefix group:
;  - GCN and GFX7 multiply each operand by 1.0, mask it with 0xffff0000,
;    perform an f32 add, and mask the sum with 0xffff0000 again.
;  - GFX8 and later shift each operand left by 16, perform an f32 add, add
;    bit 16 of the sum plus 0x7fff to the sum, select (sum | 0x400000) instead
;    when the sum compares unordered with itself, and finally shift the
;    selected value right by 16.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
8943 define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
8944 ; GCN-LABEL: v_fadd_bf16:
8946 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8947 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
8948 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
8949 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
8950 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8951 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
8952 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8953 ; GCN-NEXT: s_setpc_b64 s[30:31]
8955 ; GFX7-LABEL: v_fadd_bf16:
8957 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8958 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
8959 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
8960 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
8961 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8962 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
8963 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8964 ; GFX7-NEXT: s_setpc_b64 s[30:31]
8966 ; GFX8-LABEL: v_fadd_bf16:
8968 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8969 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8970 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8971 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
8972 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
8973 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
8974 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
8975 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
8976 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
8977 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
8978 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
8979 ; GFX8-NEXT: s_setpc_b64 s[30:31]
8981 ; GFX9-LABEL: v_fadd_bf16:
8983 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8984 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8985 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8986 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
8987 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
8988 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
8989 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
8990 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
8991 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
8992 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
8993 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
8994 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8996 ; GFX10-LABEL: v_fadd_bf16:
8998 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8999 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9000 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
9001 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
9002 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
9003 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
9004 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9005 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
9006 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9007 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9008 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9010 ; GFX11-LABEL: v_fadd_bf16:
9012 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9013 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9014 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
9015 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9016 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
9017 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
9018 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
9019 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9020 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9021 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
9022 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9023 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9024 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9025 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9026 %op = fadd bfloat %a, %b
9027 ret bfloat %op
; Two-element bfloat vector fadd: adds %a and %b lane-wise and returns the
; <2 x bfloat> result.
; Checked output, by prefix group:
;  - GCN and GFX7 take the lanes in separate registers (v0/v1 against v2/v3),
;    multiply each by 1.0, mask with 0xffff0000, do two f32 adds, and mask
;    both sums with 0xffff0000.
;  - GFX8 and later take both lanes packed in one register per operand: the
;    low lane is obtained with a left shift by 16, the high lane with an
;    0xffff0000 mask. Each f32 sum then gets bit 16 plus 0x7fff added, with
;    (sum | 0x400000) selected instead when the sum compares unordered with
;    itself.
;  - The two results are repacked with v_alignbit_b32 on GFX8 and with
;    v_perm_b32 using selector 0x7060302 on GFX9, GFX10 and GFX11.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
9030 define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
9031 ; GCN-LABEL: v_fadd_v2bf16:
9033 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9034 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9035 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9036 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9037 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9038 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9039 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9040 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9041 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9042 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3
9043 ; GCN-NEXT: v_add_f32_e32 v0, v0, v2
9044 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9045 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9046 ; GCN-NEXT: s_setpc_b64 s[30:31]
9048 ; GFX7-LABEL: v_fadd_v2bf16:
9050 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9051 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9052 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9053 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9054 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9055 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9056 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9057 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9058 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9059 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
9060 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
9061 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9062 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9063 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9065 ; GFX8-LABEL: v_fadd_v2bf16:
9067 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9068 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9069 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9070 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
9071 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
9072 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
9073 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9074 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9075 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
9076 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
9077 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
9078 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9079 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
9080 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
9081 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
9082 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
9083 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
9084 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9085 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
9086 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9087 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
9088 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9090 ; GFX9-LABEL: v_fadd_v2bf16:
9092 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9093 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9094 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9095 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
9096 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9097 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9098 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
9099 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9100 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
9101 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
9102 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
9103 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9104 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
9105 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
9106 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
9107 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
9108 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9109 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
9110 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9111 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
9112 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9114 ; GFX10-LABEL: v_fadd_v2bf16:
9116 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9117 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9118 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9119 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9120 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9121 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
9122 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
9123 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
9124 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
9125 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
9126 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9127 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
9128 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
9129 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
9130 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9131 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9132 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9133 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
9134 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9136 ; GFX11-LABEL: v_fadd_v2bf16:
9138 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9139 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9140 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9141 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9142 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9143 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
9144 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
9145 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2
9146 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9147 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
9148 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
9149 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
9150 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9151 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
9152 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
9153 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
9154 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9155 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9156 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9157 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9159 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
9160 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9161 %op = fadd <2 x bfloat> %a, %b
9162 ret <2 x bfloat> %op
9165 define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
9166 ; GCN-LABEL: v_fadd_v3bf16:
9168 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9169 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9170 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9171 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9172 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9173 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9174 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9175 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9176 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9177 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9178 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9179 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9180 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9181 ; GCN-NEXT: v_add_f32_e32 v2, v2, v5
9182 ; GCN-NEXT: v_add_f32_e32 v1, v1, v4
9183 ; GCN-NEXT: v_add_f32_e32 v0, v0, v3
9184 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9185 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9186 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9187 ; GCN-NEXT: s_setpc_b64 s[30:31]
9189 ; GFX7-LABEL: v_fadd_v3bf16:
9191 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9192 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9193 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9194 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9195 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9196 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9197 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9198 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9199 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9200 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9201 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9202 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9203 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9204 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
9205 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
9206 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
9207 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9208 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9209 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9210 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9212 ; GFX8-LABEL: v_fadd_v3bf16:
9214 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9215 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9216 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9217 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
9218 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
9219 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
9220 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
9221 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
9222 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9223 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
9224 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9225 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
9226 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
9227 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
9228 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9229 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
9230 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9231 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9232 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
9233 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
9234 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
9235 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9236 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
9237 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
9238 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
9239 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
9240 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
9241 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9242 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
9243 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9244 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9245 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
9246 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9248 ; GFX9-LABEL: v_fadd_v3bf16:
9250 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9251 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9252 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9253 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
9254 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
9255 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9256 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
9257 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
9258 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9259 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
9260 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9261 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
9262 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
9263 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9264 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9265 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
9266 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
9267 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
9268 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
9269 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9270 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
9271 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
9272 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
9273 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
9274 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9275 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
9276 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9277 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
9278 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
9279 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9281 ; GFX10-LABEL: v_fadd_v3bf16:
9283 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9284 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9285 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9286 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9287 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9288 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9289 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9290 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
9291 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
9292 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
9293 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
9294 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
9295 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
9296 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9297 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
9298 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9299 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
9300 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9301 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
9302 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9303 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9304 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9305 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9306 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9307 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9308 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9309 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
9310 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9312 ; GFX11TRUE16-LABEL: v_fadd_v3bf16:
9313 ; GFX11TRUE16: ; %bb.0:
9314 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9315 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9316 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9317 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9318 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9319 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9320 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9321 ; GFX11TRUE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9322 ; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9323 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9324 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
9325 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
9326 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9327 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
9328 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
9329 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9330 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
9331 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
9332 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9333 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9334 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9335 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9336 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9337 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9338 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9339 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9340 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9341 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
9342 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
9343 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
9345 ; GFX11FAKE16-LABEL: v_fadd_v3bf16:
9346 ; GFX11FAKE16: ; %bb.0:
9347 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9348 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9349 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9350 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9351 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9352 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9353 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9354 ; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9355 ; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9356 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9357 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
9358 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
9359 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9360 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
9361 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
9362 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9363 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
9364 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
9365 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9366 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9367 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9368 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9369 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9370 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9371 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9372 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9373 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9374 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
9375 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
9376 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
9377 %op = fadd <3 x bfloat> %a, %b
9378 ret <3 x bfloat> %op
; v_fadd_v4bf16: element-wise fadd of two <4 x bfloat> vectors, returned as <4 x bfloat>.
; The check lines below pin the llc output for each RUN-line prefix (GCN, GFX7,
; GFX8, GFX9, GFX10, and GFX11, which is shared by both gfx1100 RUN lines).
; They are autogenerated by utils/update_llc_test_checks.py (see the NOTE at the
; top of the file); regenerate them with that script rather than editing by hand.
; As the expected output shows:
;   * GCN/GFX7 mask every operand with 0xffff0000 before v_add_f32 and mask
;     each of the four results (v0-v3) with 0xffff0000 again afterwards.
;   * GFX8 and later take the low and high 16-bit halves of each input VGPR
;     (v_lshlrev_b32 by 16 / v_and_b32 with 0xffff0000), add them as f32, and
;     combine the results into v0/v1 with v_alignbit_b32 (GFX8) or
;     v_perm_b32 with selector 0x7060302 (GFX9, GFX10, GFX11).
9381 define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
9382 ; GCN-LABEL: v_fadd_v4bf16:
9384 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9385 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9386 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9387 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9388 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9389 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9390 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
9391 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9392 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
9393 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9394 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9395 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9396 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9397 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9398 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9399 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9400 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9401 ; GCN-NEXT: v_add_f32_e32 v3, v3, v7
9402 ; GCN-NEXT: v_add_f32_e32 v2, v2, v6
9403 ; GCN-NEXT: v_add_f32_e32 v1, v1, v5
9404 ; GCN-NEXT: v_add_f32_e32 v0, v0, v4
9405 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9406 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9407 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9408 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9409 ; GCN-NEXT: s_setpc_b64 s[30:31]
9411 ; GFX7-LABEL: v_fadd_v4bf16:
9413 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9414 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9415 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9416 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9417 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9418 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9419 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
9420 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9421 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
9422 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9423 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9424 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9425 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9426 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9427 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9428 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9429 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9430 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
9431 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
9432 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
9433 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
9434 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9435 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9436 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9437 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9438 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9440 ; GFX8-LABEL: v_fadd_v4bf16:
9442 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9443 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9444 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9445 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
9446 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
9447 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
9448 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9449 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9450 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
9451 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
9452 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
9453 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
9454 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
9455 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9456 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
9457 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
9458 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
9459 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
9460 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9461 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
9462 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9463 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9464 ; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
9465 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
9466 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
9467 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9468 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9469 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
9470 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
9471 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
9472 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9473 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
9474 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
9475 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
9476 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
9477 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
9478 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9479 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
9480 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9481 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9482 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
9483 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
9484 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9486 ; GFX9-LABEL: v_fadd_v4bf16:
9488 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9489 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9490 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9491 ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
9492 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9493 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9494 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
9495 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9496 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
9497 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
9498 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
9499 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
9500 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
9501 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
9502 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
9503 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
9504 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9505 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
9506 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9507 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9508 ; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
9509 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9510 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9511 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
9512 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
9513 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
9514 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
9515 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9516 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
9517 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
9518 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
9519 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
9520 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9521 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
9522 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9523 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
9524 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
9525 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9527 ; GFX10-LABEL: v_fadd_v4bf16:
9529 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9530 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9531 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9532 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9533 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9534 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
9535 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
9536 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
9537 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9538 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9539 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
9540 ; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
9541 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
9542 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
9543 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
9544 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9545 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
9546 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
9547 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
9548 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
9549 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
9550 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
9551 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9552 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
9553 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9554 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
9555 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
9556 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
9557 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9558 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9559 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9560 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9561 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
9562 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9563 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9564 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9566 ; GFX11-LABEL: v_fadd_v4bf16:
9568 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9569 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
9570 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
9571 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9572 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9573 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9574 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9575 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9576 ; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
9577 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9578 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
9579 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9580 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
9581 ; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
9582 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
9583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
9584 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
9585 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
9586 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
9587 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9588 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
9589 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
9590 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
9591 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
9592 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9593 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9594 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
9595 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9596 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
9597 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
9598 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9599 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9600 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9601 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9602 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9603 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
9604 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9605 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9606 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9607 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9608 %op = fadd <4 x bfloat> %a, %b
9609 ret <4 x bfloat> %op
9612 define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
9613 ; GCN-LABEL: v_fadd_v8bf16:
9615 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9616 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9617 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
9618 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9619 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
9620 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9621 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
9622 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9623 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
9624 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9625 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
9626 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9627 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
9628 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
9629 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
9630 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
9631 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
9632 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
9633 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9634 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
9635 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9636 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
9637 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9638 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
9639 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9640 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
9641 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9642 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
9643 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9644 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
9645 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9646 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
9647 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9648 ; GCN-NEXT: v_add_f32_e32 v7, v7, v15
9649 ; GCN-NEXT: v_add_f32_e32 v6, v6, v14
9650 ; GCN-NEXT: v_add_f32_e32 v5, v5, v13
9651 ; GCN-NEXT: v_add_f32_e32 v4, v4, v12
9652 ; GCN-NEXT: v_add_f32_e32 v3, v3, v11
9653 ; GCN-NEXT: v_add_f32_e32 v2, v2, v10
9654 ; GCN-NEXT: v_add_f32_e32 v1, v1, v9
9655 ; GCN-NEXT: v_add_f32_e32 v0, v0, v8
9656 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9657 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9658 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9659 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9660 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9661 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9662 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9663 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9664 ; GCN-NEXT: s_setpc_b64 s[30:31]
9666 ; GFX7-LABEL: v_fadd_v8bf16:
9668 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9669 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9670 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
9671 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9672 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
9673 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9674 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
9675 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9676 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
9677 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9678 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
9679 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9680 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
9681 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
9682 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
9683 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
9684 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
9685 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
9686 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9687 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
9688 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9689 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
9690 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9691 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
9692 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9693 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
9694 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9695 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
9696 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9697 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
9698 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9699 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
9700 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9701 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v15
9702 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v14
9703 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v13
9704 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v12
9705 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v11
9706 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v10
9707 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
9708 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
9709 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9710 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9711 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9712 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9713 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9714 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9715 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9716 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9717 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9719 ; GFX8-LABEL: v_fadd_v8bf16:
9721 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9722 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9723 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9724 ; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
9725 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
9726 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
9727 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9728 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9729 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
9730 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
9731 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
9732 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
9733 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
9734 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9735 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
9736 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
9737 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
9738 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
9739 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9740 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
9741 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
9742 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
9743 ; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
9744 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
9745 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
9746 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9747 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9748 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9749 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
9750 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
9751 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
9752 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
9753 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
9754 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
9755 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
9756 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
9757 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9758 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
9759 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9760 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
9761 ; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
9762 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
9763 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
9764 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9765 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9766 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9767 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
9768 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
9769 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
9770 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
9771 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
9772 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
9773 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
9774 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
9775 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9776 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
9777 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
9778 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
9779 ; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
9780 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
9781 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
9782 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9783 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9784 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9785 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
9786 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
9787 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
9788 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
9789 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
9790 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
9791 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
9792 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
9793 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9794 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
9795 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
9796 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
9797 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9798 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9799 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
9800 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
9801 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
9802 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
9803 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9805 ; GFX9-LABEL: v_fadd_v8bf16:
9807 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9808 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9809 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9810 ; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
9811 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9812 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9813 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
9814 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9815 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
9816 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
9817 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
9818 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
9819 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
9820 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
9821 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
9822 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
9823 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9824 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
9825 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
9826 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
9827 ; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
9828 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9829 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9830 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
9831 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
9832 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
9833 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
9834 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
9835 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
9836 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
9837 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
9838 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
9839 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9840 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
9841 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9842 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
9843 ; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
9844 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9845 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9846 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
9847 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
9848 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
9849 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
9850 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
9851 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
9852 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
9853 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
9854 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
9855 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9856 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
9857 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
9858 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
9859 ; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
9860 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9861 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9862 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
9863 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
9864 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
9865 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
9866 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
9867 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
9868 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
9869 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
9870 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
9871 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9872 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
9873 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9874 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
9875 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
9876 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
9877 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
9878 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9880 ; GFX10-LABEL: v_fadd_v8bf16:
9882 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9883 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9884 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9885 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9886 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9887 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
9888 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9889 ; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
9890 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
9891 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9892 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
9893 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
9894 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
9895 ; GFX10-NEXT: v_add_f32_e32 v7, v10, v9
9896 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
9897 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9898 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
9899 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
9900 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
9901 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
9902 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9903 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
9904 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9905 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
9906 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
9907 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
9908 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
9909 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
9910 ; GFX10-NEXT: v_add_f32_e32 v6, v10, v6
9911 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
9912 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9913 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9914 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
9915 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
9916 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
9917 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
9918 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9919 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9920 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9921 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
9922 ; GFX10-NEXT: v_add_f32_e32 v5, v15, v13
9923 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
9924 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
9925 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
9926 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
9927 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
9928 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
9929 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
9930 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9931 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
9932 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
9933 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
9934 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
9935 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
9936 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
9937 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
9938 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
9939 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
9940 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
9941 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
9942 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9943 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
9944 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9945 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
9946 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
9947 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9948 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9949 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
9950 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
9951 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9953 ; GFX11-LABEL: v_fadd_v8bf16:
9955 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9956 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
9957 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9958 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9959 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9960 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
9961 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9962 ; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
9963 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
9964 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9965 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9966 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
9967 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9968 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
9969 ; GFX11-NEXT: v_add_f32_e32 v7, v10, v9
9970 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
9971 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
9972 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9973 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
9974 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
9975 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
9976 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
9977 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9978 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
9979 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
9980 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
9981 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
9982 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9983 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
9984 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9985 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9986 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
9987 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9988 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9989 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
9990 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9991 ; GFX11-NEXT: v_add_f32_e32 v6, v10, v6
9992 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
9993 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9994 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
9995 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
9996 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
9997 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
9998 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
9999 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10000 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
10001 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10002 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10003 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
10004 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10005 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
10006 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
10007 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
10008 ; GFX11-NEXT: v_add_f32_e32 v5, v15, v13
10009 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10010 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
10011 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
10012 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
10013 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
10014 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10015 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
10016 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
10017 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10018 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
10019 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
10020 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
10021 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
10022 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
10023 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10024 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
10025 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10026 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10027 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
10028 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
10029 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10030 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
10031 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
10032 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10033 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
10034 ; GFX11-NEXT: s_setpc_b64 s[30:31]
10035 %op = fadd <8 x bfloat> %a, %b
10036 ret <8 x bfloat> %op
10039 define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
10040 ; GCN-LABEL: v_fadd_v16bf16:
10042 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10043 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
10044 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
10045 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10046 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10047 ; GCN-NEXT: v_add_f32_e32 v14, v14, v30
10048 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
10049 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
10050 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10051 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10052 ; GCN-NEXT: v_add_f32_e32 v13, v13, v29
10053 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
10054 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
10055 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10056 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10057 ; GCN-NEXT: v_add_f32_e32 v12, v12, v28
10058 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
10059 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
10060 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
10061 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
10062 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
10063 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
10064 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
10065 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
10066 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
10067 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
10068 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
10069 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
10070 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
10071 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
10072 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
10073 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
10074 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
10075 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
10076 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
10077 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
10078 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
10079 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
10080 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
10081 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
10082 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
10083 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10084 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10085 ; GCN-NEXT: v_add_f32_e32 v11, v11, v27
10086 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
10087 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10088 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10089 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10090 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10091 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10092 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10093 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10094 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10095 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10096 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10097 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10098 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10099 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10100 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10101 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10102 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10103 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10104 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10105 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10106 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10107 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10108 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10109 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10110 ; GCN-NEXT: v_add_f32_e32 v10, v10, v26
10111 ; GCN-NEXT: v_add_f32_e32 v9, v9, v25
10112 ; GCN-NEXT: v_add_f32_e32 v8, v8, v24
10113 ; GCN-NEXT: v_add_f32_e32 v7, v7, v23
10114 ; GCN-NEXT: v_add_f32_e32 v6, v6, v22
10115 ; GCN-NEXT: v_add_f32_e32 v5, v5, v21
10116 ; GCN-NEXT: v_add_f32_e32 v4, v4, v20
10117 ; GCN-NEXT: v_add_f32_e32 v3, v3, v19
10118 ; GCN-NEXT: v_add_f32_e32 v2, v2, v18
10119 ; GCN-NEXT: v_add_f32_e32 v1, v1, v17
10120 ; GCN-NEXT: v_add_f32_e32 v0, v0, v16
10121 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10122 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10123 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10124 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10125 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10126 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10127 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10128 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10129 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10130 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10131 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10132 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10133 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10134 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10135 ; GCN-NEXT: s_waitcnt vmcnt(0)
10136 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
10137 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10138 ; GCN-NEXT: v_add_f32_e32 v15, v15, v16
10139 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10140 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10141 ; GCN-NEXT: s_setpc_b64 s[30:31]
10143 ; GFX7-LABEL: v_fadd_v16bf16:
10145 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10146 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
10147 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
10148 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10149 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10150 ; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
10151 ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
10152 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
10153 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
10154 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10155 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10156 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
10157 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
10158 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
10159 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
10160 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
10161 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
10162 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
10163 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
10164 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
10165 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
10166 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
10167 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
10168 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
10169 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
10170 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
10171 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
10172 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
10173 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
10174 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
10175 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
10176 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
10177 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
10178 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
10179 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
10180 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
10181 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
10182 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
10183 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
10184 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10185 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10186 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10187 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10188 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10189 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10190 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10191 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10192 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10193 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10194 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10195 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10196 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10197 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10198 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10199 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10200 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10201 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10202 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10203 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10204 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10205 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10206 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10207 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10208 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10209 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10210 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10211 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
10212 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
10213 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
10214 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
10215 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
10216 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
10217 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v23
10218 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v21
10219 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v20
10220 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v19
10221 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v18
10222 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v17
10223 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
10224 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10225 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10226 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10227 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10228 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10229 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10230 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10231 ; GFX7-NEXT: s_waitcnt vmcnt(0)
10232 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
10233 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10234 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
10235 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10236 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10237 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10238 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10239 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10240 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10241 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10242 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10243 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10244 ; GFX7-NEXT: s_setpc_b64 s[30:31]
10246 ; GFX8-LABEL: v_fadd_v16bf16:
10248 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10249 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10250 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10251 ; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
10252 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
10253 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
10254 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
10255 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10256 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10257 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10258 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
10259 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
10260 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
10261 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
10262 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
10263 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
10264 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
10265 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
10266 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
10267 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
10268 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
10269 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
10270 ; GFX8-NEXT: v_add_f32_e32 v15, v17, v15
10271 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
10272 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
10273 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10274 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10275 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10276 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
10277 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
10278 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
10279 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
10280 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
10281 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
10282 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
10283 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
10284 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
10285 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
10286 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
10287 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
10288 ; GFX8-NEXT: v_add_f32_e32 v14, v17, v14
10289 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
10290 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
10291 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10292 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10293 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10294 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
10295 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
10296 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
10297 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
10298 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
10299 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
10300 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
10301 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
10302 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
10303 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
10304 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
10305 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
10306 ; GFX8-NEXT: v_add_f32_e32 v13, v17, v13
10307 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
10308 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
10309 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10310 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10311 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10312 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
10313 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
10314 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
10315 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
10316 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
10317 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
10318 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
10319 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
10320 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
10321 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
10322 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10323 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
10324 ; GFX8-NEXT: v_add_f32_e32 v12, v17, v12
10325 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
10326 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
10327 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10328 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10329 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10330 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
10331 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
10332 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
10333 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
10334 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
10335 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
10336 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
10337 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
10338 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
10339 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
10340 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
10341 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
10342 ; GFX8-NEXT: v_add_f32_e32 v11, v17, v11
10343 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
10344 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
10345 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10346 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10347 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10348 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
10349 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
10350 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
10351 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
10352 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
10353 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
10354 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
10355 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
10356 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
10357 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
10358 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
10359 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
10360 ; GFX8-NEXT: v_add_f32_e32 v10, v17, v10
10361 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
10362 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
10363 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10364 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10365 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10366 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
10367 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
10368 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
10369 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
10370 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
10371 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
10372 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
10373 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
10374 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
10375 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
10376 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
10377 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
10378 ; GFX8-NEXT: v_add_f32_e32 v9, v17, v9
10379 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
10380 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
10381 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10382 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10383 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10384 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
10385 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
10386 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
10387 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
10388 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
10389 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
10390 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
10391 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
10392 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
10393 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
10394 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
10395 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
10396 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
10397 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
10398 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
10399 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
10400 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
10401 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
10402 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
10403 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
10404 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
10405 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
10406 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
10407 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
10408 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
10409 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
10410 ; GFX8-NEXT: s_setpc_b64 s[30:31]
10412 ; GFX9-LABEL: v_fadd_v16bf16:
10414 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10415 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10416 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10417 ; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
10418 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10419 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10420 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
10421 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
10422 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
10423 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
10424 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
10425 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
10426 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
10427 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
10428 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
10429 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
10430 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
10431 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
10432 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
10433 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
10434 ; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
10435 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10436 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10437 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
10438 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
10439 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
10440 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
10441 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
10442 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
10443 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
10444 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
10445 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
10446 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
10447 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
10448 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
10449 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
10450 ; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
10451 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10452 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10453 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
10454 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
10455 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
10456 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
10457 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
10458 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
10459 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
10460 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
10461 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
10462 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
10463 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
10464 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
10465 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
10466 ; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
10467 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10468 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10469 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
10470 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
10471 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
10472 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
10473 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
10474 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
10475 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
10476 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
10477 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
10478 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
10479 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
10480 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10481 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
10482 ; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
10483 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10484 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10485 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
10486 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
10487 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
10488 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
10489 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
10490 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
10491 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
10492 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
10493 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
10494 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
10495 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
10496 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
10497 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
10498 ; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
10499 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10500 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10501 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
10502 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
10503 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
10504 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
10505 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
10506 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
10507 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
10508 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
10509 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
10510 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
10511 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
10512 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
10513 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
10514 ; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
10515 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10516 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10517 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
10518 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
10519 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
10520 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
10521 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
10522 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
10523 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
10524 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
10525 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
10526 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
10527 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
10528 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
10529 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
10530 ; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
10531 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10532 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10533 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
10534 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
10535 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
10536 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
10537 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
10538 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
10539 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
10540 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
10541 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
10542 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
10543 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
10544 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
10545 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
10546 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
10547 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
10548 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
10549 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
10550 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
10551 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
10552 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
10553 ; GFX9-NEXT: s_setpc_b64 s[30:31]
10555 ; GFX10-LABEL: v_fadd_v16bf16:
10557 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10558 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10559 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10560 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10561 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10562 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
10563 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10564 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
10565 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
10566 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
10567 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10568 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
10569 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
10570 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
10571 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
10572 ; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
10573 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
10574 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
10575 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
10576 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
10577 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
10578 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
10579 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
10580 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
10581 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
10582 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
10583 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10584 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10585 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
10586 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
10587 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10588 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10589 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
10590 ; GFX10-NEXT: v_add_f32_e32 v17, v20, v19
10591 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
10592 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
10593 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
10594 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
10595 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
10596 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
10597 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
10598 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10599 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
10600 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10601 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10602 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
10603 ; GFX10-NEXT: v_add_f32_e32 v13, v19, v18
10604 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
10605 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
10606 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10607 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
10608 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
10609 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
10610 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
10611 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10612 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10613 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10614 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
10615 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
10616 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10617 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10618 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
10619 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10620 ; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
10621 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
10622 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
10623 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
10624 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
10625 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
10626 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10627 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
10628 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
10629 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
10630 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
10631 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10632 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
10633 ; GFX10-NEXT: v_add_f32_e32 v18, v19, v18
10634 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10635 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
10636 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
10637 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
10638 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
10639 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
10640 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10641 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10642 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
10643 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
10644 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
10645 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10646 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10647 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
10648 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
10649 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
10650 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10651 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
10652 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
10653 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
10654 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10655 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
10656 ; GFX10-NEXT: v_add_f32_e32 v19, v22, v20
10657 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
10658 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
10659 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10660 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10661 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
10662 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
10663 ; GFX10-NEXT: v_add_f32_e32 v9, v22, v20
10664 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
10665 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
10666 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
10667 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
10668 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
10669 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
10670 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
10671 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
10672 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
10673 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10674 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
10675 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10676 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
10677 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
10678 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
10679 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
10680 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10681 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
10682 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
10683 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
10684 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10685 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10686 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10687 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
10688 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
10689 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10690 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
10691 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
10692 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10693 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
10694 ; GFX10-NEXT: s_setpc_b64 s[30:31]
10696 ; GFX11-LABEL: v_fadd_v16bf16:
10698 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10699 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
10700 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10701 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10702 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10703 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10704 ; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
10705 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
10706 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10707 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
10708 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10709 ; GFX11-NEXT: v_add_f32_e32 v17, v18, v17
10710 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v14
10711 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
10712 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10713 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
10714 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
10715 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10716 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10717 ; GFX11-NEXT: v_add_f32_e32 v7, v7, v15
10718 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
10719 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
10720 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
10721 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10722 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
10723 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
10724 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
10725 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
10726 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
10727 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10728 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10729 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
10730 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
10731 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10732 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
10733 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
10734 ; GFX11-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
10735 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
10736 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
10737 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
10738 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10739 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10740 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10741 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
10742 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10743 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
10744 ; GFX11-NEXT: v_add_f32_e32 v4, v4, v12
10745 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10746 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10747 ; GFX11-NEXT: v_add_f32_e32 v5, v5, v13
10748 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
10749 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10750 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
10751 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
10752 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
10753 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10754 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
10755 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
10756 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10757 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
10758 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
10759 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
10760 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10761 ; GFX11-NEXT: v_add_f32_e32 v12, v18, v12
10762 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10763 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
10764 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
10765 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
10766 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
10767 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10768 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10769 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
10770 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
10771 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
10772 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
10773 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
10774 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
10775 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10776 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
10777 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
10778 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
10779 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10780 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10781 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
10782 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
10783 ; GFX11-NEXT: v_add_f32_e32 v18, v19, v18
10784 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10785 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
10786 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10787 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10788 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
10789 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
10790 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10791 ; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
10792 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v11
10793 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
10794 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
10795 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
10796 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
10797 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10798 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
10799 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
10800 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10801 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10802 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
10803 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
10804 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
10805 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
10806 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
10807 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10808 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
10809 ; GFX11-NEXT: v_add_f32_e32 v19, v22, v20
10810 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
10811 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
10812 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10813 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
10814 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
10815 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10816 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10817 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
10818 ; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9
10819 ; GFX11-NEXT: v_add_f32_e32 v9, v22, v20
10820 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10821 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
10822 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
10823 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
10824 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
10825 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
10826 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
10827 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10828 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
10829 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
10830 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10831 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
10832 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
10833 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10834 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10835 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
10836 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
10837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10838 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
10839 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10840 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10841 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10842 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
10843 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10844 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
10845 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10846 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
10847 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
10848 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10849 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10850 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
10851 ; GFX11-NEXT: s_setpc_b64 s[30:31]
10852 %op = fadd <16 x bfloat> %a, %b
10853 ret <16 x bfloat> %op
10856 define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
10857 ; GCN-LABEL: v_fadd_v32bf16:
10859 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10860 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
10861 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
10862 ; GCN-NEXT: s_waitcnt vmcnt(1)
10863 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
10864 ; GCN-NEXT: s_waitcnt vmcnt(0)
10865 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
10866 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10867 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
10868 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
10869 ; GCN-NEXT: v_add_f32_e32 v31, v31, v32
10870 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
10871 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10872 ; GCN-NEXT: s_waitcnt vmcnt(0)
10873 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10874 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10875 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
10876 ; GCN-NEXT: v_add_f32_e32 v30, v30, v32
10877 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
10878 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10879 ; GCN-NEXT: s_waitcnt vmcnt(0)
10880 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10881 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10882 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
10883 ; GCN-NEXT: v_add_f32_e32 v29, v29, v32
10884 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
10885 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10886 ; GCN-NEXT: s_waitcnt vmcnt(0)
10887 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10888 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10889 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
10890 ; GCN-NEXT: v_add_f32_e32 v28, v28, v32
10891 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
10892 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10893 ; GCN-NEXT: s_waitcnt vmcnt(0)
10894 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10895 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10896 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
10897 ; GCN-NEXT: v_add_f32_e32 v27, v27, v32
10898 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
10899 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10900 ; GCN-NEXT: s_waitcnt vmcnt(0)
10901 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10902 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10903 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
10904 ; GCN-NEXT: v_add_f32_e32 v26, v26, v32
10905 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
10906 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10907 ; GCN-NEXT: s_waitcnt vmcnt(0)
10908 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10909 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10910 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
10911 ; GCN-NEXT: v_add_f32_e32 v25, v25, v32
10912 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
10913 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10914 ; GCN-NEXT: s_waitcnt vmcnt(0)
10915 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10916 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10917 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
10918 ; GCN-NEXT: v_add_f32_e32 v24, v24, v32
10919 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
10920 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10921 ; GCN-NEXT: s_waitcnt vmcnt(0)
10922 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10923 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10924 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
10925 ; GCN-NEXT: v_add_f32_e32 v23, v23, v32
10926 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
10927 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10928 ; GCN-NEXT: s_waitcnt vmcnt(0)
10929 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10930 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10931 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
10932 ; GCN-NEXT: v_add_f32_e32 v22, v22, v32
10933 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
10934 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10935 ; GCN-NEXT: s_waitcnt vmcnt(0)
10936 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10937 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10938 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
10939 ; GCN-NEXT: v_add_f32_e32 v21, v21, v32
10940 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
10941 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10942 ; GCN-NEXT: s_waitcnt vmcnt(0)
10943 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10944 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10945 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
10946 ; GCN-NEXT: v_add_f32_e32 v20, v20, v32
10947 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
10948 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10949 ; GCN-NEXT: s_waitcnt vmcnt(0)
10950 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10951 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10952 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
10953 ; GCN-NEXT: v_add_f32_e32 v19, v19, v32
10954 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
10955 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10956 ; GCN-NEXT: s_waitcnt vmcnt(0)
10957 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10958 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10959 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
10960 ; GCN-NEXT: v_add_f32_e32 v18, v18, v32
10961 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
10962 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10963 ; GCN-NEXT: s_waitcnt vmcnt(0)
10964 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10965 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10966 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
10967 ; GCN-NEXT: v_add_f32_e32 v17, v17, v32
10968 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
10969 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10970 ; GCN-NEXT: s_waitcnt vmcnt(0)
10971 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10972 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10973 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
10974 ; GCN-NEXT: v_add_f32_e32 v16, v16, v32
10975 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
10976 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10977 ; GCN-NEXT: s_waitcnt vmcnt(0)
10978 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10979 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10980 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
10981 ; GCN-NEXT: v_add_f32_e32 v15, v15, v32
10982 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
10983 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10984 ; GCN-NEXT: s_waitcnt vmcnt(0)
10985 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10986 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10987 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
10988 ; GCN-NEXT: v_add_f32_e32 v14, v14, v32
10989 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
10990 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10991 ; GCN-NEXT: s_waitcnt vmcnt(0)
10992 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10993 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10994 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
10995 ; GCN-NEXT: v_add_f32_e32 v13, v13, v32
10996 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
10997 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10998 ; GCN-NEXT: s_waitcnt vmcnt(0)
10999 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11000 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11001 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
11002 ; GCN-NEXT: v_add_f32_e32 v12, v12, v32
11003 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
11004 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11005 ; GCN-NEXT: s_waitcnt vmcnt(0)
11006 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11007 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11008 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
11009 ; GCN-NEXT: v_add_f32_e32 v11, v11, v32
11010 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
11011 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11012 ; GCN-NEXT: s_waitcnt vmcnt(0)
11013 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11014 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11015 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
11016 ; GCN-NEXT: v_add_f32_e32 v10, v10, v32
11017 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
11018 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11019 ; GCN-NEXT: s_waitcnt vmcnt(0)
11020 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11021 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11022 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
11023 ; GCN-NEXT: v_add_f32_e32 v9, v9, v32
11024 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
11025 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11026 ; GCN-NEXT: s_waitcnt vmcnt(0)
11027 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11028 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11029 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
11030 ; GCN-NEXT: v_add_f32_e32 v8, v8, v32
11031 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
11032 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11033 ; GCN-NEXT: s_waitcnt vmcnt(0)
11034 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11035 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11036 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
11037 ; GCN-NEXT: v_add_f32_e32 v7, v7, v32
11038 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
11039 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11040 ; GCN-NEXT: s_waitcnt vmcnt(0)
11041 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11042 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11043 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
11044 ; GCN-NEXT: v_add_f32_e32 v6, v6, v32
11045 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
11046 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11047 ; GCN-NEXT: s_waitcnt vmcnt(0)
11048 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11049 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11050 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
11051 ; GCN-NEXT: v_add_f32_e32 v5, v5, v32
11052 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
11053 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11054 ; GCN-NEXT: s_waitcnt vmcnt(0)
11055 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11056 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11057 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
11058 ; GCN-NEXT: v_add_f32_e32 v4, v4, v32
11059 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
11060 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11061 ; GCN-NEXT: s_waitcnt vmcnt(0)
11062 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11063 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11064 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
11065 ; GCN-NEXT: v_add_f32_e32 v3, v3, v32
11066 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
11067 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11068 ; GCN-NEXT: s_waitcnt vmcnt(0)
11069 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11070 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11071 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
11072 ; GCN-NEXT: v_add_f32_e32 v2, v2, v32
11073 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
11074 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11075 ; GCN-NEXT: s_waitcnt vmcnt(0)
11076 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11077 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11078 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
11079 ; GCN-NEXT: v_add_f32_e32 v1, v1, v32
11080 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
11081 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11082 ; GCN-NEXT: s_waitcnt vmcnt(0)
11083 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11084 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11085 ; GCN-NEXT: v_add_f32_e32 v0, v0, v32
11086 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11087 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11088 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11089 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11090 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11091 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11092 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11093 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11094 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11095 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11096 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11097 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11098 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11099 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11100 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11101 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11102 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11103 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11104 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11105 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11106 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11107 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11108 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11109 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11110 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11111 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11112 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11113 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11114 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11115 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11116 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11117 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11118 ; GCN-NEXT: s_setpc_b64 s[30:31]
11120 ; GFX7-LABEL: v_fadd_v32bf16:
11122 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11123 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
11124 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
11125 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
11126 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11127 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
11128 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11129 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
11130 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11131 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
11132 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11133 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
11134 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11135 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
11136 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11137 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
11138 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11139 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
11140 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11141 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
11142 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11143 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
11144 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11145 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
11146 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11147 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
11148 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11149 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
11150 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11151 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
11152 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11153 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
11154 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11155 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
11156 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11157 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
11158 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11159 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
11160 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11161 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
11162 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11163 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
11164 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11165 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
11166 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11167 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
11168 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11169 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
11170 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11171 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
11172 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11173 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
11174 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11175 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
11176 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11177 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
11178 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11179 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
11180 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11181 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
11182 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11183 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
11184 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11185 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
11186 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11187 ; GFX7-NEXT: s_waitcnt vmcnt(1)
11188 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
11189 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11190 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11191 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11192 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11193 ; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
11194 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
11195 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11196 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11197 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11198 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11199 ; GFX7-NEXT: v_add_f32_e32 v30, v30, v32
11200 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
11201 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11202 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11203 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11204 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11205 ; GFX7-NEXT: v_add_f32_e32 v29, v29, v32
11206 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
11207 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11208 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11209 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11210 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11211 ; GFX7-NEXT: v_add_f32_e32 v28, v28, v32
11212 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
11213 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11214 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11215 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11216 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11217 ; GFX7-NEXT: v_add_f32_e32 v27, v27, v32
11218 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
11219 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11220 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11221 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11222 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11223 ; GFX7-NEXT: v_add_f32_e32 v26, v26, v32
11224 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
11225 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11226 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11227 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11228 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11229 ; GFX7-NEXT: v_add_f32_e32 v25, v25, v32
11230 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
11231 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11232 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11233 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11234 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11235 ; GFX7-NEXT: v_add_f32_e32 v24, v24, v32
11236 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
11237 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11238 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11239 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11240 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11241 ; GFX7-NEXT: v_add_f32_e32 v23, v23, v32
11242 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
11243 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11244 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11245 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11246 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11247 ; GFX7-NEXT: v_add_f32_e32 v22, v22, v32
11248 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
11249 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11250 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11251 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11252 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11253 ; GFX7-NEXT: v_add_f32_e32 v21, v21, v32
11254 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
11255 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11256 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11257 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11258 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11259 ; GFX7-NEXT: v_add_f32_e32 v20, v20, v32
11260 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
11261 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11262 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11263 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11264 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11265 ; GFX7-NEXT: v_add_f32_e32 v19, v19, v32
11266 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
11267 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11268 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11269 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11270 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11271 ; GFX7-NEXT: v_add_f32_e32 v18, v18, v32
11272 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
11273 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11274 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11275 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11276 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11277 ; GFX7-NEXT: v_add_f32_e32 v17, v17, v32
11278 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
11279 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11280 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11281 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11282 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11283 ; GFX7-NEXT: v_add_f32_e32 v16, v16, v32
11284 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
11285 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11286 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11287 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11288 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11289 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v32
11290 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
11291 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11292 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11293 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11294 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11295 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v32
11296 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
11297 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11298 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11299 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11300 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11301 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v32
11302 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
11303 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11304 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11305 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11306 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11307 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v32
11308 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
11309 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11310 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11311 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11312 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11313 ; GFX7-NEXT: v_add_f32_e32 v11, v11, v32
11314 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
11315 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11316 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11317 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11318 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11319 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v32
11320 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
11321 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11322 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11323 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11324 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11325 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v32
11326 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
11327 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11328 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11329 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11330 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11331 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v32
11332 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
11333 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11334 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11335 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11336 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11337 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v32
11338 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
11339 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11340 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11341 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11342 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11343 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v32
11344 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
11345 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11346 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11347 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11348 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11349 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v32
11350 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
11351 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11352 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11353 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11354 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11355 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v32
11356 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
11357 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11358 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11359 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11360 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11361 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v32
11362 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
11363 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11364 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11365 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11366 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11367 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v32
11368 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
11369 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11370 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11371 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11372 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11373 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v32
11374 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
11375 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11376 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11377 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11378 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11379 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v32
11380 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11381 ; GFX7-NEXT: s_setpc_b64 s[30:31]
11383 ; GFX8-LABEL: v_fadd_v32bf16:
11385 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11386 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
11387 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
11388 ; GFX8-NEXT: v_add_f32_e32 v31, v32, v31
11389 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
11390 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
11391 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
11392 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11393 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11394 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
11395 ; GFX8-NEXT: v_add_f32_e32 v14, v14, v30
11396 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
11397 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
11398 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
11399 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
11400 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
11401 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
11402 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
11403 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
11404 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
11405 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
11406 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
11407 ; GFX8-NEXT: v_add_f32_e32 v32, v32, v30
11408 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
11409 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
11410 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11411 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11412 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11413 ; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
11414 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
11415 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
11416 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
11417 ; GFX8-NEXT: s_waitcnt vmcnt(0)
11418 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
11419 ; GFX8-NEXT: v_add_f32_e32 v33, v33, v34
11420 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11421 ; GFX8-NEXT: v_add_f32_e32 v30, v15, v30
11422 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
11423 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
11424 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
11425 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
11426 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
11427 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
11428 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
11429 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
11430 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11431 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
11432 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
11433 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
11434 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
11435 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
11436 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11437 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
11438 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
11439 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
11440 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
11441 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
11442 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
11443 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
11444 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
11445 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
11446 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
11447 ; GFX8-NEXT: v_add_f32_e32 v29, v33, v29
11448 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
11449 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
11450 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11451 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11452 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11453 ; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
11454 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
11455 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
11456 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
11457 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
11458 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
11459 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
11460 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
11461 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
11462 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
11463 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
11464 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
11465 ; GFX8-NEXT: v_add_f32_e32 v28, v33, v28
11466 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
11467 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
11468 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11469 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11470 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11471 ; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
11472 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
11473 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
11474 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
11475 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
11476 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
11477 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
11478 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
11479 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
11480 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
11481 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
11482 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
11483 ; GFX8-NEXT: v_add_f32_e32 v27, v33, v27
11484 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
11485 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
11486 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11487 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11488 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11489 ; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
11490 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
11491 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
11492 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
11493 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
11494 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
11495 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
11496 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
11497 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
11498 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
11499 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
11500 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
11501 ; GFX8-NEXT: v_add_f32_e32 v26, v33, v26
11502 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
11503 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
11504 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11505 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11506 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11507 ; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
11508 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
11509 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
11510 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
11511 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
11512 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
11513 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
11514 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
11515 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
11516 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
11517 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
11518 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
11519 ; GFX8-NEXT: v_add_f32_e32 v25, v33, v25
11520 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
11521 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
11522 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11523 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11524 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11525 ; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
11526 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
11527 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
11528 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
11529 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
11530 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
11531 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
11532 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
11533 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
11534 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
11535 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
11536 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
11537 ; GFX8-NEXT: v_add_f32_e32 v24, v33, v24
11538 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
11539 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
11540 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11541 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11542 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11543 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
11544 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
11545 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
11546 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
11547 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
11548 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
11549 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
11550 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
11551 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
11552 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
11553 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
11554 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
11555 ; GFX8-NEXT: v_add_f32_e32 v23, v33, v23
11556 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
11557 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
11558 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11559 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11560 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11561 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
11562 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
11563 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
11564 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
11565 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
11566 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
11567 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
11568 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
11569 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
11570 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
11571 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
11572 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
11573 ; GFX8-NEXT: v_add_f32_e32 v22, v33, v22
11574 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
11575 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
11576 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11577 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11578 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11579 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
11580 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
11581 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
11582 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
11583 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
11584 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
11585 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
11586 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
11587 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
11588 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
11589 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
11590 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
11591 ; GFX8-NEXT: v_add_f32_e32 v21, v33, v21
11592 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
11593 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
11594 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11595 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11596 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11597 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
11598 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
11599 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
11600 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
11601 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
11602 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
11603 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
11604 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
11605 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
11606 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
11607 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
11608 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
11609 ; GFX8-NEXT: v_add_f32_e32 v20, v33, v20
11610 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
11611 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
11612 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11613 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11614 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11615 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
11616 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
11617 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
11618 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
11619 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
11620 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
11621 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
11622 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
11623 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
11624 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
11625 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
11626 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
11627 ; GFX8-NEXT: v_add_f32_e32 v19, v33, v19
11628 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
11629 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
11630 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11631 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11632 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11633 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
11634 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
11635 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
11636 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
11637 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
11638 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
11639 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
11640 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
11641 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
11642 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
11643 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
11644 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
11645 ; GFX8-NEXT: v_add_f32_e32 v18, v33, v18
11646 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
11647 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
11648 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11649 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11650 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11651 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
11652 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
11653 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
11654 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
11655 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
11656 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
11657 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
11658 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
11659 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
11660 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
11661 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
11662 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
11663 ; GFX8-NEXT: v_add_f32_e32 v17, v33, v17
11664 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
11665 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
11666 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11667 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11668 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11669 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
11670 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
11671 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
11672 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
11673 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
11674 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
11675 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
11676 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
11677 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
11678 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
11679 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
11680 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
11681 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
11682 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
11683 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
11684 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
11685 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
11686 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
11687 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
11688 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
11689 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
11690 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
11691 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
11692 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
11693 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
11694 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
11695 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
11696 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
11697 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
11698 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
11699 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
11700 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
11701 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
11702 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
11703 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
11704 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
11705 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
11706 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
11707 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
11708 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
11709 ; GFX8-NEXT: s_setpc_b64 s[30:31]
11711 ; GFX9-LABEL: v_fadd_v32bf16:
11713 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11714 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
11715 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
11716 ; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
11717 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11718 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11719 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
11720 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
11721 ; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
11722 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
11723 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
11724 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
11725 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
11726 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
11727 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
11728 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
11729 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
11730 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
11731 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
11732 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
11733 ; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
11734 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11735 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11736 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
11737 ; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
11738 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
11739 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
11740 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
11741 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
11742 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
11743 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
11744 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
11745 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
11746 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
11747 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
11748 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
11749 ; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
11750 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
11751 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
11752 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11753 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11754 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11755 ; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
11756 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
11757 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
11758 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11759 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
11760 ; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
11761 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11762 ; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
11763 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
11764 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
11765 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
11766 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
11767 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
11768 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
11769 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
11770 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
11771 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
11772 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
11773 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
11774 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
11775 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
11776 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
11777 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
11778 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
11779 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
11780 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
11781 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
11782 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
11783 ; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
11784 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11785 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11786 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
11787 ; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
11788 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
11789 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
11790 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
11791 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
11792 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
11793 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
11794 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
11795 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
11796 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
11797 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
11798 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
11799 ; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
11800 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11801 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11802 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
11803 ; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
11804 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
11805 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
11806 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
11807 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
11808 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
11809 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
11810 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
11811 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
11812 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
11813 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
11814 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
11815 ; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
11816 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11817 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11818 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
11819 ; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
11820 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
11821 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
11822 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
11823 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
11824 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
11825 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
11826 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
11827 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
11828 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
11829 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
11830 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
11831 ; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
11832 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11833 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11834 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
11835 ; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
11836 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
11837 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
11838 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
11839 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
11840 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
11841 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
11842 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
11843 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
11844 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
11845 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
11846 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
11847 ; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
11848 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11849 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11850 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
11851 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
11852 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
11853 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
11854 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
11855 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
11856 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
11857 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
11858 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
11859 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
11860 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
11861 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
11862 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
11863 ; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
11864 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11865 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11866 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
11867 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
11868 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
11869 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
11870 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
11871 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
11872 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
11873 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
11874 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
11875 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
11876 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
11877 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
11878 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
11879 ; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
11880 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11881 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11882 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
11883 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
11884 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
11885 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
11886 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
11887 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
11888 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
11889 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
11890 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
11891 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
11892 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
11893 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
11894 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
11895 ; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
11896 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11897 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11898 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
11899 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
11900 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
11901 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
11902 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
11903 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
11904 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
11905 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
11906 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
11907 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
11908 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
11909 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
11910 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
11911 ; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
11912 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11913 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11914 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
11915 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
11916 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
11917 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
11918 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
11919 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
11920 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
11921 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
11922 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
11923 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
11924 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
11925 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
11926 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
11927 ; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
11928 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11929 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11930 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
11931 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
11932 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
11933 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
11934 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
11935 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
11936 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
11937 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
11938 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
11939 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
11940 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
11941 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
11942 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
11943 ; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
11944 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11945 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11946 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
11947 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
11948 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
11949 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
11950 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
11951 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
11952 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
11953 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
11954 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
11955 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
11956 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
11957 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
11958 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
11959 ; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
11960 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11961 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11962 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
11963 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
11964 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
11965 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
11966 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
11967 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
11968 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
11969 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
11970 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
11971 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
11972 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
11973 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
11974 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
11975 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
11976 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
11977 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
11978 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
11979 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
11980 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
11981 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
11982 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
11983 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
11984 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
11985 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
11986 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
11987 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
11988 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
11989 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
11990 ; GFX9-NEXT: s_setpc_b64 s[30:31]
11992 ; GFX10-LABEL: v_fadd_v32bf16:
11994 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11995 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
11996 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
11997 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
11998 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11999 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
12000 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
12001 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
12002 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
12003 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
12004 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
12005 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
12006 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
12007 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
12008 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
12009 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
12010 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
12011 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
12012 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
12013 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
12014 ; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
12015 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
12016 ; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
12017 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
12018 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
12019 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
12020 ; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
12021 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
12022 ; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
12023 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
12024 ; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
12025 ; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
12026 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
12027 ; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
12028 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
12029 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
12030 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
12031 ; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
12032 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
12033 ; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
12034 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
12035 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
12036 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
12037 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
12038 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
12039 ; GFX10-NEXT: v_add_f32_e32 v27, v50, v27
12040 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
12041 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
12042 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12043 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
12044 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
12045 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
12046 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
12047 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
12048 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
12049 ; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
12050 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
12051 ; GFX10-NEXT: v_add_f32_e32 v29, v38, v29
12052 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
12053 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
12054 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12055 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
12056 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
12057 ; GFX10-NEXT: v_add_f32_e32 v28, v48, v28
12058 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
12059 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
12060 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12061 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
12062 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
12063 ; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
12064 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
12065 ; GFX10-NEXT: v_add_f32_e32 v34, v34, v51
12066 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
12067 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
12068 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12069 ; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
12070 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
12071 ; GFX10-NEXT: v_add_f32_e32 v30, v36, v30
12072 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
12073 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
12074 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12075 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
12076 ; GFX10-NEXT: v_add_f32_e32 v18, v48, v23
12077 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
12078 ; GFX10-NEXT: v_add_f32_e32 v17, v50, v22
12079 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
12080 ; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
12081 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
12082 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
12083 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
12084 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12085 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
12086 ; GFX10-NEXT: v_add_f32_e32 v20, v36, v25
12087 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
12088 ; GFX10-NEXT: v_add_f32_e32 v19, v38, v24
12089 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
12090 ; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
12091 ; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
12092 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
12093 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
12094 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
12095 ; GFX10-NEXT: v_add_f32_e32 v21, v51, v26
12096 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
12097 ; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
12098 ; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
12099 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
12100 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
12101 ; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
12102 ; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
12103 ; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
12104 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
12105 ; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
12106 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
12107 ; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
12108 ; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
12109 ; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
12110 ; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
12111 ; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
12112 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
12113 ; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
12114 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
12115 ; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
12116 ; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
12117 ; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
12118 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
12119 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
12120 ; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
12121 ; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
12122 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
12123 ; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
12124 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
12125 ; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
12126 ; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
12127 ; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
12128 ; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
12129 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
12130 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
12131 ; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
12132 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
12133 ; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
12134 ; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
12135 ; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
12136 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
12137 ; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
12138 ; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
12139 ; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
12140 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
12141 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
12142 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
12143 ; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
12144 ; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
12145 ; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
12146 ; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
12147 ; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
12148 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
12149 ; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
12150 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
12151 ; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
12152 ; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
12153 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
12154 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
12155 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
12156 ; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
12157 ; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
12158 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
12159 ; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
12160 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
12161 ; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
12162 ; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
12163 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
12164 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
12165 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
12166 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
12167 ; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
12168 ; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
12169 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
12170 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
12171 ; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
12172 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
12173 ; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
12174 ; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
12175 ; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
12176 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
12177 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
12178 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
12179 ; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
12180 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
12181 ; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
12182 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
12183 ; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
12184 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
12185 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
12186 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
12187 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
12188 ; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
12189 ; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
12190 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
12191 ; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
12192 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
12193 ; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
12194 ; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
12195 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
12196 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
12197 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
12198 ; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
12199 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
12200 ; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
12201 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
12202 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
12203 ; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
12204 ; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
12205 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
12206 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
12207 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
12208 ; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
12209 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
12210 ; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
12211 ; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
12212 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
12213 ; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
12214 ; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
12215 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
12216 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
12217 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
12218 ; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
12219 ; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
12220 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
12221 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
12222 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
12223 ; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
12224 ; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
12225 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
12226 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
12227 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
12228 ; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
12229 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
12230 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12231 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
12232 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
12233 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12234 ; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
12235 ; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
12236 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
12237 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12238 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
12239 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
12240 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
12241 ; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
12242 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
12243 ; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
12244 ; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
12245 ; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
12246 ; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
12247 ; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
12248 ; GFX10-NEXT: s_waitcnt vmcnt(0)
12249 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
12250 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
12251 ; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
12252 ; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
12253 ; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
12254 ; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
12255 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
12256 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12257 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
12258 ; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
12259 ; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
12260 ; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
12261 ; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
12262 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
12263 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
12264 ; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
12265 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
12266 ; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
12267 ; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
12268 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
12269 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12271 ; GFX11-LABEL: v_fadd_v32bf16:
12273 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12274 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
12275 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
12276 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
12277 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
12278 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12279 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
12280 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
12281 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
12282 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12283 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
12284 ; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
12285 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
12286 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12287 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
12288 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
12289 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
12290 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
12291 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
12292 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
12293 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
12294 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
12295 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
12296 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
12297 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12298 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
12299 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
12300 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
12301 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12302 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
12303 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
12304 ; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
12305 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
12306 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
12307 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
12308 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12309 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
12310 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
12311 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
12312 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
12313 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
12314 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
12315 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
12316 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
12317 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12318 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
12319 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12320 ; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
12321 ; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
12322 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
12323 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
12324 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12325 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
12326 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
12327 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
12328 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
12329 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
12330 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
12331 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
12332 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
12333 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
12334 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
12335 ; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
12336 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v16
12337 ; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
12338 ; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
12339 ; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
12340 ; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
12341 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
12342 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
12343 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
12344 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
12345 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
12346 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
12347 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
12348 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
12349 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
12350 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
12351 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12352 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
12353 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
12354 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
12355 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
12356 ; GFX11-NEXT: v_add_f32_e32 v4, v4, v20
12357 ; GFX11-NEXT: v_add_f32_e32 v20, v80, v71
12358 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
12359 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
12360 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
12361 ; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
12362 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12363 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
12364 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
12365 ; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
12366 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
12367 ; GFX11-NEXT: v_add_f32_e32 v26, v52, v51
12368 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12369 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
12370 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
12371 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
12372 ; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
12373 ; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
12374 ; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
12375 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
12376 ; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
12377 ; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
12378 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12379 ; GFX11-NEXT: v_add_f32_e32 v29, v38, v37
12380 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
12381 ; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
12382 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
12383 ; GFX11-NEXT: v_add_f32_e32 v14, v14, v30
12384 ; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
12385 ; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
12386 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
12387 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
12388 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
12389 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12390 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
12391 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
12392 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
12393 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
12394 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
12395 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
12396 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
12397 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
12398 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
12399 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
12400 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
12401 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
12402 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
12403 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
12404 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
12405 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
12406 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
12407 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
12408 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
12409 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
12410 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
12411 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
12412 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
12413 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
12414 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
12415 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
12416 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
12417 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
12418 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
12419 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
12420 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
12421 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
12422 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
12423 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
12424 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
12425 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
12426 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
12427 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
12428 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
12429 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
12430 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
12431 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
12432 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
12433 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
12434 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
12435 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
12436 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
12437 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
12438 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
12439 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
12440 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
12441 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
12442 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
12443 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
12444 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
12445 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
12446 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
12447 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
12448 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
12449 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
12450 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
12451 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
12452 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
12453 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
12454 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
12455 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
12456 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
12457 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
12458 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
12459 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
12460 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
12461 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
12462 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
12463 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
12464 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
12465 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
12466 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
12467 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
12468 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
12469 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
12470 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
12471 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
12472 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
12473 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
12474 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
12475 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
12476 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
12477 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
12478 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
12479 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
12480 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
12481 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
12482 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
12483 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
12484 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
12485 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
12486 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
12487 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
12488 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
12489 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
12490 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12491 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
12492 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
12493 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
12494 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
12495 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
12496 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
12497 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
12498 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
12499 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
12500 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
12501 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12502 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
12503 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
12504 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
12505 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
12506 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
12507 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
12508 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
12509 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
12510 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
12511 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
12512 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
12513 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
12514 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
12515 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12516 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12517 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
12518 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
12519 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12520 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
12521 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12522 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
12523 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
12524 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
12525 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12526 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
12527 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
12528 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
12529 ; GFX11-NEXT: s_waitcnt vmcnt(0)
12530 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
12531 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12532 ; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
12533 ; GFX11-NEXT: v_add_f32_e32 v15, v15, v18
12534 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12535 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
12536 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
12537 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
12538 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12539 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
12540 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
12541 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
12542 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12543 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
12544 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
12545 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
12546 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12547 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
12548 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12549 %op = fadd <32 x bfloat> %a, %b
12550 ret <32 x bfloat> %op
; v_fadd_bf16_fpimm_0: add the floating-point constant 1.0 to one bfloat
; argument.
;
; What the expected sequences below pin down:
;   * every prefix expects 1.0 to appear directly as an operand of v_add_f32;
;   * the GCN and GFX7 prefixes expect the input to be multiplied by 1.0 and
;     masked with 0xffff0000 before the add, and the sum to be masked with
;     0xffff0000 afterwards;
;   * the GFX8, GFX9, GFX10 and GFX11 prefixes expect the input to be shifted
;     left by 16 before the add, then a v_bfe_u32 / add 0x7fff / v_or_b32
;     0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence, then a right shift
;     by 16.
; NOTE(review): that last sequence looks like round-to-nearest-even with NaN
; quieting when narrowing f32 to bf16 - confirm against the lowering code.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
12553 define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
12554 ; GCN-LABEL: v_fadd_bf16_fpimm_0:
12556 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12557 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12558 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12559 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
12560 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12561 ; GCN-NEXT: s_setpc_b64 s[30:31]
12563 ; GFX7-LABEL: v_fadd_bf16_fpimm_0:
12565 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12566 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12567 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12568 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
12569 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12570 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12572 ; GFX8-LABEL: v_fadd_bf16_fpimm_0:
12574 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12575 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12576 ; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
12577 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12578 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12579 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12580 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12581 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12582 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12583 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12584 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12586 ; GFX9-LABEL: v_fadd_bf16_fpimm_0:
12588 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12589 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12590 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
12591 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12592 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12593 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12594 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12595 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12596 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12597 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12598 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12600 ; GFX10-LABEL: v_fadd_bf16_fpimm_0:
12602 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12603 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12604 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
12605 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12606 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12607 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12608 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12609 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12610 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12611 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12613 ; GFX11-LABEL: v_fadd_bf16_fpimm_0:
12615 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12616 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12617 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12618 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
12619 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12620 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12621 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12622 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12623 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12624 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12625 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12626 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12627 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: bfloat add with the constant 1.0 as the right-hand operand.
12628 %add = fadd bfloat %arg0, 1.0
; v_fadd_bf16_fpimm_1: add the floating-point constant 42.0 to one bfloat
; argument.
;
; What the expected sequences below pin down:
;   * every prefix expects the constant to appear as the 32-bit literal
;     0x42280000 (the IEEE-754 single-precision encoding of 42.0) on the
;     v_add_f32 instruction;
;   * the GCN and GFX7 prefixes expect a 0xffff0000 mask before and after
;     the add;
;   * the GFX8, GFX9, GFX10 and GFX11 prefixes expect a left shift by 16
;     before the add, then a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;     v_cmp_u_f32 / v_cndmask_b32 sequence, then a right shift by 16.
; NOTE(review): that last sequence looks like round-to-nearest-even with NaN
; quieting when narrowing f32 to bf16 - confirm against the lowering code.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
12632 define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
12633 ; GCN-LABEL: v_fadd_bf16_fpimm_1:
12635 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12636 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12637 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12638 ; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12639 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12640 ; GCN-NEXT: s_setpc_b64 s[30:31]
12642 ; GFX7-LABEL: v_fadd_bf16_fpimm_1:
12644 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12645 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12646 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12647 ; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12648 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12649 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12651 ; GFX8-LABEL: v_fadd_bf16_fpimm_1:
12653 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12654 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12655 ; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12656 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12657 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12658 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12659 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12660 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12661 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12662 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12663 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12665 ; GFX9-LABEL: v_fadd_bf16_fpimm_1:
12667 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12668 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12669 ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12670 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12671 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12672 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12673 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12674 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12675 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12676 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12677 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12679 ; GFX10-LABEL: v_fadd_bf16_fpimm_1:
12681 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12682 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12683 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12684 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12685 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12686 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12687 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12688 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12689 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12690 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12692 ; GFX11-LABEL: v_fadd_bf16_fpimm_1:
12694 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12695 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12696 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12697 ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12698 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12699 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12700 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12701 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12702 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12703 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12704 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12705 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12706 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: bfloat add with the constant 42.0 as the right-hand operand.
12707 %add = fadd bfloat %arg0, 42.0
; v_fsub_bf16: subtract one bfloat argument from another (%a - %b).
;
; What the expected sequences below pin down:
;   * every prefix expects a single "v_sub_f32 v0, v0, v1";
;   * the GCN and GFX7 prefixes expect both inputs to be multiplied by 1.0
;     and masked with 0xffff0000 before the subtract, and the difference to
;     be masked with 0xffff0000 afterwards;
;   * the GFX8, GFX9, GFX10 and GFX11 prefixes expect both inputs to be
;     shifted left by 16 before the subtract, then a v_bfe_u32 / add 0x7fff /
;     v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence, then a right
;     shift by 16.
; NOTE(review): that last sequence looks like round-to-nearest-even with NaN
; quieting when narrowing f32 to bf16 - confirm against the lowering code.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
12711 define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
12712 ; GCN-LABEL: v_fsub_bf16:
12714 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12715 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12716 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12717 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12718 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12719 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
12720 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12721 ; GCN-NEXT: s_setpc_b64 s[30:31]
12723 ; GFX7-LABEL: v_fsub_bf16:
12725 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12726 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12727 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12728 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12729 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12730 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
12731 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12732 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12734 ; GFX8-LABEL: v_fsub_bf16:
12736 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12737 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12738 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12739 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
12740 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12741 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12742 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12743 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12744 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12745 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12746 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12747 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12749 ; GFX9-LABEL: v_fsub_bf16:
12751 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12752 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12753 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12754 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
12755 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12756 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12757 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12758 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12759 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12760 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12761 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12762 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12764 ; GFX10-LABEL: v_fsub_bf16:
12766 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12767 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12768 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12769 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
12770 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12771 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12772 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12773 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12774 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12775 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12776 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12778 ; GFX11-LABEL: v_fsub_bf16:
12780 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12781 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12782 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12783 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12784 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
12785 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12786 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12787 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12788 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12789 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12790 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12792 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12793 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: scalar bfloat subtraction, %a minus %b.
12794 %op = fsub bfloat %a, %b
; v_fsub_v2bf16: element-wise subtraction of two <2 x bfloat> vectors
; (%a - %b).
;
; What the expected sequences below pin down:
;   * the GCN and GFX7 prefixes expect four separate input registers
;     (v0..v3), one v_sub_f32 per element ("v1 = v1 - v3", "v0 = v0 - v2"),
;     and a 0xffff0000 mask on each input and each result;
;   * the GFX8, GFX9, GFX10 and GFX11 prefixes expect only v0 and v1 as
;     inputs: one element is extracted with a left shift by 16 and the other
;     with a 0xffff0000 mask, each difference goes through the v_bfe_u32 /
;     add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence,
;     and the two results are merged into v0 with v_alignbit_b32 (GFX8) or
;     v_perm_b32 with selector 0x7060302 (GFX9 and newer).
; NOTE(review): the two-register input on GFX8 and newer presumably means
; the <2 x bfloat> value is passed packed in one 32-bit VGPR - confirm
; against the calling-convention lowering.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
12798 define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
12799 ; GCN-LABEL: v_fsub_v2bf16:
12801 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12802 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12803 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
12804 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12805 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
12806 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12807 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12808 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12809 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12810 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v3
12811 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2
12812 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12813 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12814 ; GCN-NEXT: s_setpc_b64 s[30:31]
12816 ; GFX7-LABEL: v_fsub_v2bf16:
12818 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12819 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12820 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
12821 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12822 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
12823 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12824 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12825 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12826 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12827 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
12828 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2
12829 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12830 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12831 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12833 ; GFX8-LABEL: v_fsub_v2bf16:
12835 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12836 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12837 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12838 ; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
12839 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
12840 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
12841 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12842 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12843 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
12844 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
12845 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
12846 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
12847 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12848 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
12849 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12850 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12851 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
12852 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12853 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
12854 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12855 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
12856 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12858 ; GFX9-LABEL: v_fsub_v2bf16:
12860 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12861 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12862 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12863 ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
12864 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12865 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12866 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
12867 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12868 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
12869 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
12870 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
12871 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
12872 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12873 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
12874 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12875 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
12876 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12877 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
12878 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
12879 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
12880 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12882 ; GFX10-LABEL: v_fsub_v2bf16:
12884 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12885 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12886 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12887 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12888 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12889 ; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
12890 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
12891 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
12892 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
12893 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
12894 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12895 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
12896 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
12897 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
12898 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12899 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12900 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12901 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
12902 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12904 ; GFX11-LABEL: v_fsub_v2bf16:
12906 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12907 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12908 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12909 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12910 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12911 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
12912 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
12913 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2
12914 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12915 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
12916 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
12917 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
12918 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12919 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
12920 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
12921 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
12922 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12923 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12924 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12925 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12926 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12927 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
12928 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element-wise <2 x bfloat> subtraction, returned unchanged.
12929 %op = fsub <2 x bfloat> %a, %b
12930 ret <2 x bfloat> %op
; v_fsub_v3bf16: element-wise subtraction of two <3 x bfloat> vectors
; (%a - %b).
;
; What the expected sequences below pin down:
;   * the GCN and GFX7 prefixes expect six separate input registers
;     (v0..v5), one v_sub_f32 per element, and a 0xffff0000 mask on each
;     input and each result;
;   * the GFX8, GFX9 and GFX10 prefixes expect inputs in v0..v3: the v0/v2
;     pair yields two differences (left shift by 16 and 0xffff0000 mask) and
;     the v1/v3 pair yields one (left shift by 16 only); each difference goes
;     through the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;     v_cndmask_b32 sequence;
;   * GFX11 is checked through the two separate prefixes GFX11TRUE16 and
;     GFX11FAKE16 (the +real-true16 and -real-true16 invocations at the top
;     of this file) rather than through the shared GFX11 prefix.
; NOTE(review): in the lines shown, the GFX11TRUE16 and GFX11FAKE16 sequences
; differ only in the first source operand of the final v_alignbit_b32
; (v0 versus s0) - confirm this is the intended difference when regenerating.
; NOTE(review): GFX9, GFX10 and GFX11FAKE16 use an SGPR (s4 or s0) as the
; first v_alignbit_b32 source; on GFX10 and GFX11FAKE16 it is not written in
; the sequence shown. Presumably the upper half of v1 is a don't-care for a
; three-element result - confirm against the lowering code.
; NOTE(review): the GFX8 sequence materializes 0x7fff in s4 for one add but
; uses the literal 0x7fff for the other two - confirm when regenerating.
;
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
12933 define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
12934 ; GCN-LABEL: v_fsub_v3bf16:
12936 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12937 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12938 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
12939 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12940 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
12941 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
12942 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
12943 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12944 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12945 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12946 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12947 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12948 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12949 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
12950 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
12951 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v3
12952 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12953 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12954 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12955 ; GCN-NEXT: s_setpc_b64 s[30:31]
12957 ; GFX7-LABEL: v_fsub_v3bf16:
12959 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12960 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12961 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
12962 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12963 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
12964 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
12965 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
12966 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12967 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12968 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12969 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12970 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12971 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12972 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
12973 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4
12974 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3
12975 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12976 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12977 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12978 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12980 ; GFX8-LABEL: v_fsub_v3bf16:
12982 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12983 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
12984 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12985 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
12986 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
12987 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
12988 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
12989 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
12990 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
12991 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
12992 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
12993 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
12994 ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
12995 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
12996 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
12997 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
12998 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12999 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13000 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
13001 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
13002 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
13003 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13004 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13005 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13006 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13007 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13008 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
13009 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13010 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13011 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13012 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13013 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13014 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13016 ; GFX9-LABEL: v_fsub_v3bf16:
13018 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13019 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13020 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13021 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
13022 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13023 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13024 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13025 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
13026 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13027 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13028 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13029 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13030 ; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
13031 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13032 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13033 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
13034 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
13035 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
13036 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
13037 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13038 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13039 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13040 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13041 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
13042 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13043 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13044 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13045 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13046 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
13047 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13049 ; GFX10-LABEL: v_fsub_v3bf16:
13051 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13052 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13053 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13054 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13055 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13056 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13057 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13058 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
13059 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
13060 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
13061 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
13062 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13063 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
13064 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13065 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
13066 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13067 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13068 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13069 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
13070 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13071 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13072 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13073 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13074 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13075 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13076 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13077 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
13078 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13080 ; GFX11TRUE16-LABEL: v_fsub_v3bf16:
13081 ; GFX11TRUE16: ; %bb.0:
13082 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13083 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13084 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13085 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13086 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13087 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13088 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13089 ; GFX11TRUE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13090 ; GFX11TRUE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13091 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13092 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13093 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13094 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13095 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13096 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13097 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13098 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13099 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13100 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13101 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13102 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13103 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13104 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13105 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13106 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13107 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13108 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13109 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13110 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
13111 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
13113 ; GFX11FAKE16-LABEL: v_fsub_v3bf16:
13114 ; GFX11FAKE16: ; %bb.0:
13115 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13116 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13117 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13118 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13119 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13120 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13121 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13122 ; GFX11FAKE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13123 ; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13124 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13125 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13126 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13127 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13128 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13129 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13130 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13131 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13132 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13133 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13134 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13135 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13136 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13137 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13138 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13139 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13140 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13141 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13142 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13143 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
13144 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test: element-wise <3 x bfloat> subtraction, returned unchanged.
13145 %op = fsub <3 x bfloat> %a, %b
13146 ret <3 x bfloat> %op
; v_fsub_v4bf16 - element-wise fsub of two <4 x bfloat> arguments; the
; resulting vector is returned directly.
; The assertion lines in the body are autogenerated (see the header note of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shapes: the GCN and GFX7 prefixes subtract with v_sub_f32 on
; operands masked with 0xffff0000 and mask every result again. The GFX8 and
; newer prefixes widen each 16-bit half with a shift or mask, subtract in f32,
; and then run a bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask sequence per
; lane (presumably the f32 -> bf16 rounding with NaN handling - confirm
; against the backend lowering) before repacking the halves.
13149 define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13150 ; GCN-LABEL: v_fsub_v4bf16:
13152 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13153 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13154 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13155 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13156 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13157 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13158 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
13159 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13160 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
13161 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13162 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13163 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13164 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13165 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13166 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13167 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13168 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13169 ; GCN-NEXT: v_sub_f32_e32 v3, v3, v7
13170 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v6
13171 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v5
13172 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v4
13173 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13174 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13175 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13176 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13177 ; GCN-NEXT: s_setpc_b64 s[30:31]
13179 ; GFX7-LABEL: v_fsub_v4bf16:
13181 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13182 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13183 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13184 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13185 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13186 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13187 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
13188 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13189 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
13190 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13191 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13192 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13193 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13194 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13195 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13196 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13197 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13198 ; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7
13199 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
13200 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5
13201 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4
13202 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13203 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13204 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13205 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13206 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13208 ; GFX8-LABEL: v_fsub_v4bf16:
13210 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13211 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13212 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13213 ; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
13214 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
13215 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
13216 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13217 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13218 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
13219 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
13220 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
13221 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13222 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13223 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13224 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13225 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13226 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
13227 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
13228 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13229 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13230 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13231 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13232 ; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
13233 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
13234 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
13235 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13236 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13237 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
13238 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
13239 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
13240 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13241 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13242 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13243 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13244 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13245 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
13246 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13247 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13248 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13249 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13250 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13251 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
13252 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13254 ; GFX9-LABEL: v_fsub_v4bf16:
13256 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13257 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13258 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13259 ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
13260 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13261 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13262 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
13263 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13264 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
13265 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
13266 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
13267 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13268 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13269 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13270 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13271 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
13272 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13273 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13274 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13275 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13276 ; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
13277 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13278 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13279 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
13280 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
13281 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
13282 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
13283 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13284 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13285 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13286 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13287 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
13288 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13289 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13290 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13291 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13292 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
13293 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13295 ; GFX10-LABEL: v_fsub_v4bf16:
13297 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13298 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13299 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13300 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13301 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13302 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13303 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13304 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
13305 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13306 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13307 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
13308 ; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
13309 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
13310 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13311 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
13312 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13313 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
13314 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13315 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
13316 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
13317 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
13318 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13319 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13320 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
13321 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13322 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13323 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13324 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13325 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13326 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13327 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13328 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13329 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
13330 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13331 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
13332 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13334 ; GFX11-LABEL: v_fsub_v4bf16:
13336 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13337 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13338 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13339 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13340 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13341 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13342 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13344 ; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
13345 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13346 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
13347 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13348 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
13349 ; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
13350 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
13351 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
13352 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
13353 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
13354 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
13355 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13356 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13357 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13358 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13359 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
13360 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
13361 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13362 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
13363 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13364 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13365 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
13366 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13367 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13368 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13369 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13370 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13371 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
13372 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13373 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13374 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
13375 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13376 %op = fsub <4 x bfloat> %a, %b
13377 ret <4 x bfloat> %op
; v_fmul_bf16 - scalar fmul of two bfloat arguments.
; The assertion lines in the body are autogenerated (see the header note of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shapes: the GCN and GFX7 prefixes multiply with v_mul_f32 on
; operands masked with 0xffff0000 and mask the product again. The GFX8 and
; newer prefixes shift both operands left by 16, multiply in f32, run a
; bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask sequence on the product
; (presumably the f32 -> bf16 rounding with NaN handling - confirm against
; the backend lowering), and finish with a right shift by 16.
13380 define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
13381 ; GCN-LABEL: v_fmul_bf16:
13383 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13384 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13385 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13386 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13387 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13388 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
13389 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13390 ; GCN-NEXT: s_setpc_b64 s[30:31]
13392 ; GFX7-LABEL: v_fmul_bf16:
13394 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13395 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13396 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13397 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13398 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13399 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
13400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13401 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13403 ; GFX8-LABEL: v_fmul_bf16:
13405 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13406 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13407 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13408 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
13409 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
13410 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
13411 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
13412 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
13413 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13414 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
13415 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13416 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13418 ; GFX9-LABEL: v_fmul_bf16:
13420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13421 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13422 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13423 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
13424 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
13425 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13426 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
13427 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
13428 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13429 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
13430 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13431 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13433 ; GFX10-LABEL: v_fmul_bf16:
13435 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13436 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13437 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13438 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
13439 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
13440 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
13441 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13442 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
13443 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13444 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13445 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13447 ; GFX11-LABEL: v_fmul_bf16:
13449 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13450 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13451 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13452 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13453 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
13454 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
13455 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
13456 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13457 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13458 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
13459 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13460 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13461 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13462 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13463 %op = fmul bfloat %a, %b
; v_fmul_v2bf16 - element-wise fmul of two <2 x bfloat> arguments; the
; resulting vector is returned directly.
; The assertion lines in the body are autogenerated (see the header note of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shapes: the GCN and GFX7 prefixes multiply with v_mul_f32 on
; operands masked with 0xffff0000 and mask both products again. The GFX8 and
; newer prefixes split each input register into two f32 values (shift left by
; 16 for one, mask 0xffff0000 for the other), multiply in f32, run a
; bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask sequence per product
; (presumably the f32 -> bf16 rounding with NaN handling - confirm against
; the backend lowering), and merge the two results into v0 with
; v_alignbit_b32 (GFX8) or v_perm_b32 using selector 0x7060302 (GFX9+).
13467 define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
13468 ; GCN-LABEL: v_fmul_v2bf16:
13470 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13471 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13472 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13473 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13474 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13475 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13476 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13477 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13478 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13479 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
13480 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
13481 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13482 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13483 ; GCN-NEXT: s_setpc_b64 s[30:31]
13485 ; GFX7-LABEL: v_fmul_v2bf16:
13487 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13488 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13489 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13490 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13491 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13492 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13493 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13494 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13495 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13496 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
13497 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
13498 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13499 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13500 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13502 ; GFX8-LABEL: v_fmul_v2bf16:
13504 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13505 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13506 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13507 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
13508 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
13509 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
13510 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13511 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13512 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
13513 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
13514 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
13515 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
13516 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
13517 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
13518 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
13519 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
13520 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
13521 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13522 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
13523 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13524 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
13525 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13527 ; GFX9-LABEL: v_fmul_v2bf16:
13529 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13530 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13531 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13532 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
13533 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13534 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13535 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
13536 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13537 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
13538 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
13539 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
13540 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
13541 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
13542 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
13543 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
13544 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
13545 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13546 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
13547 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13548 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
13549 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13551 ; GFX10-LABEL: v_fmul_v2bf16:
13553 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13554 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13555 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13556 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13557 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13558 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
13559 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
13560 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
13561 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
13562 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
13563 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
13564 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
13565 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
13566 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
13567 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13568 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13569 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13570 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
13571 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13573 ; GFX11-LABEL: v_fmul_v2bf16:
13575 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13576 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13577 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13578 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13579 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13580 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
13581 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
13582 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
13583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13584 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
13585 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
13586 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
13587 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
13588 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
13589 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
13590 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
13591 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
13592 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13593 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13594 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13595 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13596 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
13597 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13598 %op = fmul <2 x bfloat> %a, %b
13599 ret <2 x bfloat> %op
; v_fmul_v3bf16 - element-wise fmul of two <3 x bfloat> arguments; the
; resulting vector is returned directly.
; The assertion lines in the body are autogenerated (see the header note of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shapes: the GCN and GFX7 prefixes multiply with v_mul_f32 on
; operands masked with 0xffff0000 and mask all three products again. The GFX8
; and newer prefixes convert the halves to f32 with a shift or mask, multiply
; in f32, and run a bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask sequence
; per product (presumably the f32 -> bf16 rounding with NaN handling - confirm
; against the backend lowering).
; Unlike the other functions here, the gfx1100 expectations are split into
; the GFX11TRUE16 and GFX11FAKE16 prefixes; the two blocks differ only in the
; first source operand of the final v_alignbit_b32 (v0 versus s0).
13602 define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
13603 ; GCN-LABEL: v_fmul_v3bf16:
13605 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13606 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13607 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13608 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13609 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13610 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13611 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13612 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13613 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13614 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13615 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13616 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13617 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13618 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
13619 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
13620 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
13621 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13622 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13623 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13624 ; GCN-NEXT: s_setpc_b64 s[30:31]
13626 ; GFX7-LABEL: v_fmul_v3bf16:
13628 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13629 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13630 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13631 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13632 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13633 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13634 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13635 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13636 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13637 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13638 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13639 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13640 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13641 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
13642 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
13643 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
13644 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13645 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13646 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13647 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13649 ; GFX8-LABEL: v_fmul_v3bf16:
13651 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13652 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13653 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13654 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
13655 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13656 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13657 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
13658 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
13659 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13660 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13661 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13662 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13663 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
13664 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
13665 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13666 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
13667 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13668 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13669 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
13670 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
13671 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
13672 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13673 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13674 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13675 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13676 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13677 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
13678 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13679 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13680 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13681 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13682 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13683 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13685 ; GFX9-LABEL: v_fmul_v3bf16:
13687 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13688 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13689 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13690 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
13691 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13692 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13693 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13694 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
13695 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13696 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13697 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13698 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13699 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
13700 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13701 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13702 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
13703 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
13704 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
13705 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
13706 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13707 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13708 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13709 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13710 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
13711 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13712 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13713 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13714 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13715 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
13716 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13718 ; GFX10-LABEL: v_fmul_v3bf16:
13720 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13721 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13722 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13723 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13724 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13725 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13726 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13727 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
13728 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
13729 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
13730 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
13731 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13732 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
13733 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13734 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
13735 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13736 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13737 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13738 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
13739 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13740 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13741 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13742 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13743 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13744 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13745 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13746 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
13747 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13749 ; GFX11TRUE16-LABEL: v_fmul_v3bf16:
13750 ; GFX11TRUE16: ; %bb.0:
13751 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13752 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13753 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13754 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13755 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13756 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13757 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13758 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13759 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13760 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13761 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13762 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13763 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13764 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13765 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13766 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13767 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13768 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13769 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13770 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13771 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13772 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13773 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13774 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13775 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13776 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13777 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13778 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13779 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
13780 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
13782 ; GFX11FAKE16-LABEL: v_fmul_v3bf16:
13783 ; GFX11FAKE16: ; %bb.0:
13784 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13785 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13786 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13787 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13788 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13789 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13790 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13791 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13792 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13793 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13794 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13795 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13796 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13797 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13798 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13799 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13800 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13801 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13802 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13803 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13804 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13805 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13806 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13807 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13808 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13809 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13810 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13811 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13812 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
13813 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
13814 %op = fmul <3 x bfloat> %a, %b
13815 ret <3 x bfloat> %op
; v_fmul_v4bf16 - fmul of two <4 x bfloat> vectors, checked against every RUN
; line of this file. The assertions below are autogenerated (see the NOTE on
; the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output shows, per check prefix:
; * GCN and GFX7 - one element per VGPR (v0-v3 paired with v4-v7). Every
;   input is multiplied by 1.0 and masked with 0xffff0000, the four products
;   are formed with v_mul_f32, and each product is masked with 0xffff0000.
; * GFX8 - two elements per VGPR (v0-v1 paired with v2-v3). Each half is
;   widened with a shift left by 16 or a 0xffff0000 mask, multiplied as f32,
;   then passed through v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u /
;   v_cndmask before being repacked with v_lshrrev + v_alignbit.
;   NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;   quieting - confirm against the bf16 lowering code.
; * GFX9, GFX10 and GFX11 - same shape as GFX8, but the two adds become one
;   v_add3_u32 and the repack is a v_perm_b32 with selector 0x7060302.
;   Only the shared GFX11 prefix appears here, so the +real-true16 and
;   -real-true16 runs are both checked against the same text.
13818 define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13819 ; GCN-LABEL: v_fmul_v4bf16:
13821 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13822 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13823 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13824 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13825 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13826 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13827 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
13828 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13829 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
13830 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13831 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13832 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13833 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13834 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13835 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13836 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13837 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13838 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
13839 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
13840 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
13841 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
13842 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13843 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13844 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13845 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13846 ; GCN-NEXT: s_setpc_b64 s[30:31]
13848 ; GFX7-LABEL: v_fmul_v4bf16:
13850 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13851 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13852 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13853 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13854 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13855 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13856 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
13857 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13858 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
13859 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13860 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13861 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13862 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13863 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13864 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13865 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13866 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13867 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
13868 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
13869 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
13870 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
13871 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13872 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13873 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13874 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13875 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13877 ; GFX8-LABEL: v_fmul_v4bf16:
13879 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13880 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13881 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13882 ; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
13883 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
13884 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
13885 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13886 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13887 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
13888 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
13889 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
13890 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13891 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13892 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13893 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13894 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13895 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
13896 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
13897 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13898 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13899 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13900 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13901 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
13902 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
13903 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
13904 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13905 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13906 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
13907 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
13908 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
13909 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13910 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13911 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13912 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13913 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13914 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
13915 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13916 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13917 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13918 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13919 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13920 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
13921 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13923 ; GFX9-LABEL: v_fmul_v4bf16:
13925 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13926 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13927 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13928 ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
13929 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13930 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13931 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
13932 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13933 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
13934 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
13935 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
13936 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13937 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13938 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13939 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13940 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
13941 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13942 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13943 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13944 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13945 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
13946 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13947 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13948 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
13949 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
13950 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
13951 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
13952 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13953 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13954 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13955 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13956 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
13957 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13958 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13959 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13960 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13961 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
13962 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13964 ; GFX10-LABEL: v_fmul_v4bf16:
13966 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13967 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13968 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13969 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13970 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13971 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13972 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13973 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
13974 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13975 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13976 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
13977 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
13978 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
13979 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13980 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
13981 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13982 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
13983 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13984 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
13985 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
13986 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
13987 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13988 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13989 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
13990 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13991 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13992 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13993 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13994 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13995 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13996 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13997 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13998 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
13999 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14000 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14001 ; GFX10-NEXT: s_setpc_b64 s[30:31]
14003 ; GFX11-LABEL: v_fmul_v4bf16:
14005 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14006 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
14007 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
14008 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14009 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14010 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
14011 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
14012 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14013 ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
14014 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14015 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
14016 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14017 ; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
14018 ; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
14019 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
14020 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
14021 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
14022 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
14023 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
14024 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
14025 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
14026 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
14027 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
14028 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
14029 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
14030 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
14031 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
14032 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14033 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
14034 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
14035 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
14036 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14037 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14038 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
14039 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14040 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
14041 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
14043 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14044 ; GFX11-NEXT: s_setpc_b64 s[30:31]
14045 %op = fmul <4 x bfloat> %a, %b
14046 ret <4 x bfloat> %op
; v_fmul_v8bf16 - fmul of two <8 x bfloat> vectors, checked against every RUN
; line of this file. The assertions below are autogenerated (see the NOTE on
; the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output shows, per check prefix:
; * GCN and GFX7 - one element per VGPR (v0-v7 paired with v8-v15). Every
;   input is multiplied by 1.0 and masked with 0xffff0000, the eight products
;   are formed with v_mul_f32, and each product is masked with 0xffff0000.
; * GFX8 - two elements per VGPR (v0-v3 paired with v4-v7). Each half is
;   widened with a shift left by 16 or a 0xffff0000 mask, multiplied as f32,
;   then passed through v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u /
;   v_cndmask before being repacked with v_lshrrev + v_alignbit.
;   NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;   quieting - confirm against the bf16 lowering code.
; * GFX9, GFX10 and GFX11 - same shape as GFX8, but the two adds become one
;   v_add3_u32 and the repack is a v_perm_b32 with selector 0x7060302.
;   Only the shared GFX11 prefix appears here, so the +real-true16 and
;   -real-true16 runs are both checked against the same text.
14049 define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
14050 ; GCN-LABEL: v_fmul_v8bf16:
14052 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14053 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
14054 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
14055 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
14056 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
14057 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
14058 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
14059 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
14060 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
14061 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
14062 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
14063 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
14064 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
14065 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
14066 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
14067 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
14068 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
14069 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14070 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14071 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14072 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14073 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14074 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14075 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14076 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14077 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14078 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14079 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14080 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14081 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14082 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14083 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14084 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14085 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
14086 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
14087 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
14088 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
14089 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
14090 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
14091 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
14092 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
14093 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14094 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14095 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14096 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14097 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14098 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14099 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14100 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14101 ; GCN-NEXT: s_setpc_b64 s[30:31]
14103 ; GFX7-LABEL: v_fmul_v8bf16:
14105 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14106 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
14107 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
14108 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
14109 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
14110 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
14111 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
14112 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
14113 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
14114 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
14115 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
14116 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
14117 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
14118 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
14119 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
14120 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
14121 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
14122 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14123 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14124 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14125 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14126 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14127 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14128 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14129 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14130 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14131 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14132 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14133 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14134 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14135 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14136 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14137 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14138 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
14139 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
14140 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
14141 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
14142 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
14143 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
14144 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
14145 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
14146 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14147 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14148 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14149 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14150 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14151 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14152 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14153 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14154 ; GFX7-NEXT: s_setpc_b64 s[30:31]
14156 ; GFX8-LABEL: v_fmul_v8bf16:
14158 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14159 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14160 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14161 ; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
14162 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
14163 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
14164 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14165 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14166 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
14167 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
14168 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
14169 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
14170 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
14171 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
14172 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
14173 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
14174 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
14175 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
14176 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14177 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
14178 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
14179 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
14180 ; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
14181 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
14182 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
14183 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14184 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14185 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14186 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
14187 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
14188 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14189 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
14190 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
14191 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
14192 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
14193 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
14194 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14195 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
14196 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14197 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
14198 ; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
14199 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
14200 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
14201 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14202 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14203 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14204 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
14205 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
14206 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14207 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
14208 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
14209 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
14210 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
14211 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
14212 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14213 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
14214 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
14215 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
14216 ; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
14217 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
14218 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
14219 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14220 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14221 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14222 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
14223 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
14224 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14225 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
14226 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
14227 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
14228 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
14229 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
14230 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14231 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
14232 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
14233 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
14234 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
14235 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
14236 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
14237 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
14238 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
14239 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
14240 ; GFX8-NEXT: s_setpc_b64 s[30:31]
14242 ; GFX9-LABEL: v_fmul_v8bf16:
14244 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14245 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14246 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14247 ; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
14248 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14249 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14250 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
14251 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
14252 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
14253 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
14254 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
14255 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
14256 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
14257 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
14258 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
14259 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
14260 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14261 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
14262 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
14263 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
14264 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
14265 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14266 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14267 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
14268 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
14269 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
14270 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
14271 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14272 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
14273 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
14274 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
14275 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
14276 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14277 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
14278 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14279 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
14280 ; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
14281 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14282 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14283 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
14284 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
14285 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
14286 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
14287 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14288 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
14289 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
14290 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
14291 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
14292 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14293 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
14294 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
14295 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
14296 ; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
14297 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14298 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14299 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
14300 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
14301 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
14302 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
14303 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14304 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
14305 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
14306 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
14307 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
14308 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14309 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
14310 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
14311 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
14312 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
14313 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
14314 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
14315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14317 ; GFX10-LABEL: v_fmul_v8bf16:
14319 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14320 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14321 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14322 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14323 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14324 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
14325 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14326 ; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
14327 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
14328 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14329 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
14330 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
14331 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
14332 ; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9
14333 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
14334 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
14335 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
14336 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
14337 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
14338 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
14339 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14340 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
14341 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14342 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
14343 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
14344 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
14345 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
14346 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
14347 ; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6
14348 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
14349 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14350 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14351 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
14352 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
14353 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
14354 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
14355 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14356 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14357 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
14358 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
14359 ; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13
14360 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
14361 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
14362 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14363 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
14364 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
14365 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
14366 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
14367 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
14368 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
14369 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
14370 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
14371 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
14372 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
14373 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
14374 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
14375 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
14376 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
14377 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
14378 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14379 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14380 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14381 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14382 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
14383 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14384 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14385 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14386 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14387 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
14388 ; GFX10-NEXT: s_setpc_b64 s[30:31]
14390 ; GFX11-LABEL: v_fmul_v8bf16:
14392 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14393 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
14394 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14395 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14396 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14397 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
14398 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14399 ; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
14400 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
14401 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
14402 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14403 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
14404 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14405 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
14406 ; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9
14407 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
14408 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
14409 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14410 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
14411 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
14412 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
14413 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
14414 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14415 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
14416 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
14417 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
14418 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
14419 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14420 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
14421 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14422 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
14423 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
14424 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14425 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14426 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
14427 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14428 ; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6
14429 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
14430 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
14431 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
14432 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
14433 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
14434 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14435 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
14436 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
14437 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
14438 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14439 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14440 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
14441 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
14442 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
14443 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
14444 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
14445 ; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13
14446 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14447 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
14448 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
14449 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
14450 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
14451 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14452 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
14453 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
14454 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
14455 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
14456 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
14457 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
14458 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
14459 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14460 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14461 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14462 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14463 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
14464 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
14465 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14466 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14467 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14468 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14469 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
14470 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
14471 ; GFX11-NEXT: s_setpc_b64 s[30:31]
14472 %op = fmul <8 x bfloat> %a, %b
14473 ret <8 x bfloat> %op
14476 define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
14477 ; GCN-LABEL: v_fmul_v16bf16:
14479 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14480 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
14481 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
14482 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
14483 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14484 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
14485 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
14486 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
14487 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
14488 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14489 ; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
14490 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
14491 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
14492 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
14493 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14494 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
14495 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
14496 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
14497 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
14498 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
14499 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
14500 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
14501 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
14502 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
14503 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
14504 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
14505 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
14506 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
14507 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
14508 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
14509 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
14510 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
14511 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
14512 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
14513 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
14514 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
14515 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
14516 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
14517 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
14518 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
14519 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
14520 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
14521 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14522 ; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
14523 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
14524 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
14525 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14526 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
14527 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14528 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
14529 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14530 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
14531 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14532 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14533 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14534 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
14535 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14536 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
14537 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14538 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
14539 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14540 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
14541 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14542 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
14543 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14544 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14545 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14546 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14547 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
14548 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
14549 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
14550 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
14551 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
14552 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
14553 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
14554 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
14555 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
14556 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
14557 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
14558 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14559 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14560 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14561 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14562 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14563 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14564 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14565 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14566 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14567 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14568 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14569 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14570 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14571 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14572 ; GCN-NEXT: s_waitcnt vmcnt(0)
14573 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
14574 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14575 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
14576 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14577 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14578 ; GCN-NEXT: s_setpc_b64 s[30:31]
14580 ; GFX7-LABEL: v_fmul_v16bf16:
14582 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14583 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
14584 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
14585 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
14586 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14587 ; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
14588 ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
14589 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
14590 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
14591 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14592 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14593 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
14594 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
14595 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
14596 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
14597 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
14598 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
14599 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
14600 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
14601 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
14602 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
14603 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
14604 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
14605 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
14606 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
14607 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
14608 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
14609 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
14610 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
14611 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
14612 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
14613 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
14614 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
14615 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
14616 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
14617 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
14618 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
14619 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
14620 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
14621 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
14622 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14623 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
14624 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14625 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
14626 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14627 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
14628 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14629 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
14630 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14631 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
14632 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14633 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
14634 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14635 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14636 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
14637 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14638 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
14639 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14640 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
14641 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14642 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
14643 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14644 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
14645 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14646 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14647 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14648 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
14649 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
14650 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
14651 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
14652 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
14653 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
14654 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
14655 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
14656 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
14657 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
14658 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
14659 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
14660 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
14661 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14662 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14663 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14664 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14665 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14666 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14667 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14668 ; GFX7-NEXT: s_waitcnt vmcnt(0)
14669 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
14670 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14671 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
14672 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14673 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14674 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14675 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14676 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14677 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14678 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14679 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14680 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14681 ; GFX7-NEXT: s_setpc_b64 s[30:31]
14683 ; GFX8-LABEL: v_fmul_v16bf16:
14685 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14686 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14687 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14688 ; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
14689 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
14690 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
14691 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
14692 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14693 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14694 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14695 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
14696 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
14697 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
14698 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
14699 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
14700 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
14701 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
14702 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
14703 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14704 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
14705 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
14706 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
14707 ; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
14708 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
14709 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
14710 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14711 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14712 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14713 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
14714 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
14715 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
14716 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
14717 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
14718 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
14719 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
14720 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
14721 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14722 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
14723 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
14724 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
14725 ; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
14726 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
14727 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
14728 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14729 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14730 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14731 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
14732 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
14733 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
14734 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
14735 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
14736 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
14737 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
14738 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
14739 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14740 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
14741 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
14742 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
14743 ; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
14744 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
14745 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
14746 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14747 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14748 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14749 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
14750 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
14751 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
14752 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
14753 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
14754 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
14755 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
14756 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
14757 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
14758 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
14759 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
14760 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
14761 ; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
14762 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
14763 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
14764 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14765 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14766 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14767 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
14768 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
14769 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
14770 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
14771 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
14772 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
14773 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
14774 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
14775 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14776 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
14777 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
14778 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
14779 ; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
14780 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
14781 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
14782 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14783 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14784 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14785 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
14786 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
14787 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
14788 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
14789 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
14790 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
14791 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
14792 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
14793 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14794 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
14795 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
14796 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
14797 ; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
14798 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
14799 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
14800 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14801 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14802 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14803 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
14804 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
14805 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
14806 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
14807 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
14808 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
14809 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14810 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
14811 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14812 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
14813 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
14814 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
14815 ; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
14816 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
14817 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
14818 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14819 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14820 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14821 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
14822 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
14823 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
14824 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
14825 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
14826 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
14827 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
14828 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
14829 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14830 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
14831 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
14832 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
14833 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
14834 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
14835 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
14836 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
14837 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
14838 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
14839 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
14840 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
14841 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
14842 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
14843 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
14844 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
14845 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
14846 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
14847 ; GFX8-NEXT: s_setpc_b64 s[30:31]
14849 ; GFX9-LABEL: v_fmul_v16bf16:
14851 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14852 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14853 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14854 ; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
14855 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14856 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14857 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
14858 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
14859 ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
14860 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
14861 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
14862 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
14863 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
14864 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
14865 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
14866 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
14867 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14868 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
14869 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
14870 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
14871 ; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
14872 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14873 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14874 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
14875 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
14876 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
14877 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
14878 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
14879 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
14880 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
14881 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
14882 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
14883 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14884 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
14885 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
14886 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
14887 ; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
14888 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14889 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14890 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
14891 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
14892 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
14893 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
14894 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
14895 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
14896 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
14897 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
14898 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
14899 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14900 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
14901 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
14902 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
14903 ; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
14904 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14905 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14906 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
14907 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
14908 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
14909 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
14910 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
14911 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
14912 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
14913 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
14914 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
14915 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
14916 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
14917 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
14918 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
14919 ; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
14920 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14921 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14922 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
14923 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
14924 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
14925 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
14926 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
14927 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
14928 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
14929 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
14930 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
14931 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14932 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
14933 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
14934 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
14935 ; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
14936 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14937 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14938 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
14939 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
14940 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
14941 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
14942 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
14943 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
14944 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
14945 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
14946 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
14947 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14948 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
14949 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
14950 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
14951 ; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
14952 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14953 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14954 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
14955 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
14956 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
14957 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
14958 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
14959 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
14960 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
14961 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
14962 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
14963 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14964 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
14965 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
14966 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
14967 ; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
14968 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14969 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14970 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
14971 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
14972 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
14973 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
14974 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
14975 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
14976 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
14977 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
14978 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
14979 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14980 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
14981 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
14982 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
14983 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
14984 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
14985 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
14986 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
14987 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
14988 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
14989 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
14990 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14992 ; GFX10-LABEL: v_fmul_v16bf16:
14994 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14995 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14996 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14997 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14998 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14999 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
15000 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15001 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
15002 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
15003 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
15004 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15005 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
15006 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
15007 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
15008 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
15009 ; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
15010 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
15011 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
15012 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
15013 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
15014 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
15015 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
15016 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
15017 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
15018 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
15019 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
15020 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15021 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15022 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
15023 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
15024 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15025 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15026 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
15027 ; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19
15028 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
15029 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
15030 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
15031 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
15032 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
15033 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
15034 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
15035 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
15036 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
15037 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15038 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15039 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
15040 ; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18
15041 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
15042 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
15043 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15044 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
15045 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
15046 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
15047 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
15048 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15049 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
15050 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
15051 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
15052 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
15053 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15054 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15055 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
15056 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15057 ; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
15058 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
15059 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
15060 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
15061 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
15062 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
15063 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15064 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
15065 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
15066 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
15067 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
15068 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15069 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
15070 ; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18
15071 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15072 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
15073 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
15074 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
15075 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
15076 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
15077 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15078 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
15079 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
15080 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
15081 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
15082 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15083 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15084 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
15085 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
15086 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
15087 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15088 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
15089 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
15090 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
15091 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15092 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
15093 ; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20
15094 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
15095 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
15096 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15097 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15098 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
15099 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
15100 ; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20
15101 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
15102 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
15103 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
15104 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
15105 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
15106 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
15107 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
15108 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
15109 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
15110 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15111 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
15112 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
15113 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
15114 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
15115 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
15116 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
15117 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15118 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
15119 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
15120 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
15121 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15122 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
15123 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15124 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
15125 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
15126 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15127 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
15128 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
15129 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15130 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
15131 ; GFX10-NEXT: s_setpc_b64 s[30:31]
15133 ; GFX11-LABEL: v_fmul_v16bf16:
15135 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15136 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
15137 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
15138 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15139 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
15140 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
15141 ; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
15142 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
15143 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15144 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
15145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
15146 ; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17
15147 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14
15148 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
15149 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15150 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
15151 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
15152 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15153 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
15154 ; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
15155 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
15156 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
15157 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
15158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15159 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
15160 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
15161 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
15162 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
15163 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
15164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15165 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15166 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
15167 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
15168 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15169 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
15170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
15171 ; GFX11-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
15172 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
15173 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
15174 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
15175 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15176 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15177 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15178 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
15179 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
15180 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
15181 ; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12
15182 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
15183 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15184 ; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13
15185 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
15186 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15187 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
15188 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
15189 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
15190 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15191 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
15192 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
15193 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15194 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
15195 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
15196 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
15197 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
15198 ; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12
15199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15200 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
15201 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
15202 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
15203 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
15204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15205 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15206 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
15207 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
15208 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
15209 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
15210 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
15211 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
15212 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15213 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
15214 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
15215 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
15216 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15217 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15218 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
15219 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
15220 ; GFX11-NEXT: v_mul_f32_e32 v18, v19, v18
15221 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15222 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
15223 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15224 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15225 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
15226 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
15227 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15228 ; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
15229 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11
15230 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
15231 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
15232 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
15233 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
15234 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
15235 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
15236 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
15237 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
15238 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15239 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
15240 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
15241 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
15242 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
15243 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
15244 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15245 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
15246 ; GFX11-NEXT: v_mul_f32_e32 v19, v22, v20
15247 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
15248 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
15249 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15250 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
15251 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
15252 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15253 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15254 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
15255 ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9
15256 ; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20
15257 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15258 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
15259 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
15260 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
15261 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
15262 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
15263 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
15264 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15265 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
15266 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
15267 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
15268 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
15269 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
15270 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15271 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15272 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
15273 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
15274 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15275 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
15276 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15277 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
15278 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15279 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
15280 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15281 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
15282 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15283 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
15284 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
15285 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15286 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
15287 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
15288 ; GFX11-NEXT: s_setpc_b64 s[30:31]
15289 %op = fmul <16 x bfloat> %a, %b
15290 ret <16 x bfloat> %op
15293 define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
15294 ; GCN-LABEL: v_fmul_v32bf16:
15296 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15297 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
15298 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
15299 ; GCN-NEXT: s_waitcnt vmcnt(1)
15300 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
15301 ; GCN-NEXT: s_waitcnt vmcnt(0)
15302 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
15303 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15304 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15305 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
15306 ; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
15307 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
15308 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15309 ; GCN-NEXT: s_waitcnt vmcnt(0)
15310 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15311 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15312 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
15313 ; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
15314 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
15315 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15316 ; GCN-NEXT: s_waitcnt vmcnt(0)
15317 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15318 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15319 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
15320 ; GCN-NEXT: v_mul_f32_e32 v29, v29, v32
15321 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
15322 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15323 ; GCN-NEXT: s_waitcnt vmcnt(0)
15324 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15325 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15326 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
15327 ; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
15328 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
15329 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15330 ; GCN-NEXT: s_waitcnt vmcnt(0)
15331 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15332 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15333 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
15334 ; GCN-NEXT: v_mul_f32_e32 v27, v27, v32
15335 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
15336 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15337 ; GCN-NEXT: s_waitcnt vmcnt(0)
15338 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15339 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15340 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
15341 ; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
15342 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
15343 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15344 ; GCN-NEXT: s_waitcnt vmcnt(0)
15345 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15346 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15347 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
15348 ; GCN-NEXT: v_mul_f32_e32 v25, v25, v32
15349 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
15350 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15351 ; GCN-NEXT: s_waitcnt vmcnt(0)
15352 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15353 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15354 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
15355 ; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
15356 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
15357 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15358 ; GCN-NEXT: s_waitcnt vmcnt(0)
15359 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15360 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15361 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
15362 ; GCN-NEXT: v_mul_f32_e32 v23, v23, v32
15363 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
15364 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15365 ; GCN-NEXT: s_waitcnt vmcnt(0)
15366 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15367 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15368 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
15369 ; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
15370 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
15371 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15372 ; GCN-NEXT: s_waitcnt vmcnt(0)
15373 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15374 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15375 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
15376 ; GCN-NEXT: v_mul_f32_e32 v21, v21, v32
15377 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
15378 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15379 ; GCN-NEXT: s_waitcnt vmcnt(0)
15380 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15381 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15382 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
15383 ; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
15384 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
15385 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15386 ; GCN-NEXT: s_waitcnt vmcnt(0)
15387 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15388 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15389 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
15390 ; GCN-NEXT: v_mul_f32_e32 v19, v19, v32
15391 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
15392 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15393 ; GCN-NEXT: s_waitcnt vmcnt(0)
15394 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15395 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15396 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
15397 ; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
15398 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
15399 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15400 ; GCN-NEXT: s_waitcnt vmcnt(0)
15401 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15402 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15403 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
15404 ; GCN-NEXT: v_mul_f32_e32 v17, v17, v32
15405 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
15406 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15407 ; GCN-NEXT: s_waitcnt vmcnt(0)
15408 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15409 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15410 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
15411 ; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
15412 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
15413 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15414 ; GCN-NEXT: s_waitcnt vmcnt(0)
15415 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15416 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15417 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
15418 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v32
15419 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
15420 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15421 ; GCN-NEXT: s_waitcnt vmcnt(0)
15422 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15423 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15424 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
15425 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
15426 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
15427 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15428 ; GCN-NEXT: s_waitcnt vmcnt(0)
15429 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15430 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15431 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
15432 ; GCN-NEXT: v_mul_f32_e32 v13, v13, v32
15433 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
15434 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15435 ; GCN-NEXT: s_waitcnt vmcnt(0)
15436 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15437 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15438 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
15439 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
15440 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
15441 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15442 ; GCN-NEXT: s_waitcnt vmcnt(0)
15443 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15444 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15445 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
15446 ; GCN-NEXT: v_mul_f32_e32 v11, v11, v32
15447 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
15448 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15449 ; GCN-NEXT: s_waitcnt vmcnt(0)
15450 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15451 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15452 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
15453 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
15454 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
15455 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15456 ; GCN-NEXT: s_waitcnt vmcnt(0)
15457 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15458 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15459 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
15460 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v32
15461 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
15462 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15463 ; GCN-NEXT: s_waitcnt vmcnt(0)
15464 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15465 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15466 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
15467 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
15468 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
15469 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15470 ; GCN-NEXT: s_waitcnt vmcnt(0)
15471 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15472 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15473 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
15474 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v32
15475 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
15476 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15477 ; GCN-NEXT: s_waitcnt vmcnt(0)
15478 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15479 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15480 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
15481 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
15482 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
15483 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15484 ; GCN-NEXT: s_waitcnt vmcnt(0)
15485 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15486 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15487 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
15488 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v32
15489 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
15490 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15491 ; GCN-NEXT: s_waitcnt vmcnt(0)
15492 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15493 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15494 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
15495 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
15496 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
15497 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15498 ; GCN-NEXT: s_waitcnt vmcnt(0)
15499 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15500 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15501 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
15502 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v32
15503 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
15504 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15505 ; GCN-NEXT: s_waitcnt vmcnt(0)
15506 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15507 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15508 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
15509 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
15510 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
15511 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15512 ; GCN-NEXT: s_waitcnt vmcnt(0)
15513 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15514 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15515 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
15516 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v32
15517 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
15518 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15519 ; GCN-NEXT: s_waitcnt vmcnt(0)
15520 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15521 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15522 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
15523 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15524 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15525 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15526 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15527 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15528 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15529 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15530 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15531 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15532 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15533 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15534 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15535 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15536 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15537 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15538 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15539 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15540 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15541 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15542 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15543 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15544 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15545 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15546 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15547 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15548 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15549 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15550 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15551 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15552 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15553 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15554 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15555 ; GCN-NEXT: s_setpc_b64 s[30:31]
15557 ; GFX7-LABEL: v_fmul_v32bf16:
15559 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15560 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
15561 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
15562 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
15563 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15564 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
15565 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15566 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
15567 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15568 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
15569 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15570 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
15571 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15572 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
15573 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15574 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
15575 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15576 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
15577 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15578 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
15579 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15580 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
15581 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15582 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
15583 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15584 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
15585 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15586 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
15587 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15588 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
15589 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15590 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
15591 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15592 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
15593 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15594 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
15595 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15596 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
15597 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15598 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
15599 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15600 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
15601 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15602 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
15603 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15604 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
15605 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15606 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
15607 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15608 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
15609 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15610 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
15611 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15612 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
15613 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15614 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
15615 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15616 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
15617 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15618 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
15619 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15620 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
15621 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15622 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
15623 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15624 ; GFX7-NEXT: s_waitcnt vmcnt(1)
15625 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
15626 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15627 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15628 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15629 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15630 ; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
15631 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
15632 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15633 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15634 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15635 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15636 ; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
15637 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
15638 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15639 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15640 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15641 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15642 ; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
15643 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
15644 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15645 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15646 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15647 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15648 ; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
15649 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
15650 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15651 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15652 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15653 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15654 ; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
15655 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
15656 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15657 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15658 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15659 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15660 ; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
15661 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
15662 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15663 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15664 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15665 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15666 ; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
15667 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
15668 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15669 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15670 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15671 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15672 ; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
15673 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
15674 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15675 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15676 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15677 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15678 ; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
15679 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
15680 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15681 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15682 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15683 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15684 ; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
15685 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
15686 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15687 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15688 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15689 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15690 ; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
15691 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
15692 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15693 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15694 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15695 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15696 ; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
15697 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
15698 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15699 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15700 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15701 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15702 ; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
15703 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
15704 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15705 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15706 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15707 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15708 ; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
15709 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
15710 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15711 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15712 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15713 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15714 ; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
15715 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
15716 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15717 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15718 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15719 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15720 ; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
15721 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
15722 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15723 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15724 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15725 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15726 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
15727 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
15728 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15729 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15730 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15731 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15732 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
15733 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
15734 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15735 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15736 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15737 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15738 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
15739 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
15740 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15741 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15742 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15743 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15744 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
15745 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
15746 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15747 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15748 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15749 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15750 ; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
15751 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
15752 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15753 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15754 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15755 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15756 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
15757 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
15758 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15759 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15760 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15761 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15762 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
15763 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
15764 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15765 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15766 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15767 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15768 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
15769 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
15770 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15771 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15772 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15773 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15774 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
15775 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
15776 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15777 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15778 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15779 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15780 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
15781 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
15782 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15783 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15784 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15785 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15786 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
15787 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
15788 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15789 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15790 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15791 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15792 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
15793 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
15794 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15795 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15796 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15797 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15798 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
15799 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
15800 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15801 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15802 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15803 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15804 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
15805 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
15806 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15807 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15808 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15809 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15810 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
15811 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
15812 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15813 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15814 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15815 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15816 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
15817 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15818 ; GFX7-NEXT: s_setpc_b64 s[30:31]
15820 ; GFX8-LABEL: v_fmul_v32bf16:
15822 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15823 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
15824 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
15825 ; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
15826 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
15827 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
15828 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
15829 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15830 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15831 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
15832 ; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
15833 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
15834 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
15835 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
15836 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
15837 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
15838 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
15839 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
15840 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
15841 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
15842 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
15843 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
15844 ; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30
15845 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
15846 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
15847 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15848 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15849 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15850 ; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
15851 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
15852 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
15853 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
15854 ; GFX8-NEXT: s_waitcnt vmcnt(0)
15855 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
15856 ; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34
15857 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15858 ; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30
15859 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
15860 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
15861 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
15862 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
15863 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
15864 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
15865 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
15866 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
15867 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15868 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
15869 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
15870 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
15871 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
15872 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
15873 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15874 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
15875 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
15876 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
15877 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
15878 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
15879 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
15880 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
15881 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
15882 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
15883 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
15884 ; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29
15885 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
15886 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
15887 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15888 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15889 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15890 ; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
15891 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
15892 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
15893 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
15894 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
15895 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
15896 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
15897 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
15898 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
15899 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
15900 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
15901 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
15902 ; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28
15903 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
15904 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
15905 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15906 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15907 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15908 ; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
15909 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
15910 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
15911 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
15912 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
15913 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
15914 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
15915 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
15916 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
15917 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
15918 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
15919 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
15920 ; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27
15921 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
15922 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
15923 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15924 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15925 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15926 ; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
15927 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
15928 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
15929 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
15930 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
15931 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
15932 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
15933 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
15934 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
15935 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
15936 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
15937 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
15938 ; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26
15939 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
15940 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
15941 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15942 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15943 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15944 ; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
15945 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
15946 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
15947 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
15948 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
15949 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
15950 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
15951 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
15952 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
15953 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
15954 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
15955 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
15956 ; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25
15957 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
15958 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
15959 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15960 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15961 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15962 ; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
15963 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
15964 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
15965 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
15966 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
15967 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
15968 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
15969 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
15970 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
15971 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
15972 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
15973 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
15974 ; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
15975 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
15976 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
15977 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15978 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15979 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15980 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
15981 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
15982 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
15983 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
15984 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
15985 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
15986 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
15987 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
15988 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
15989 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
15990 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
15991 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
15992 ; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
15993 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
15994 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
15995 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15996 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15997 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15998 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
15999 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
16000 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
16001 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
16002 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
16003 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
16004 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
16005 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
16006 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
16007 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
16008 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
16009 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
16010 ; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
16011 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
16012 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
16013 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16014 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16015 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16016 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
16017 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
16018 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
16019 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
16020 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
16021 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
16022 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
16023 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
16024 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
16025 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
16026 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
16027 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
16028 ; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
16029 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
16030 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
16031 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16032 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16033 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16034 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
16035 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
16036 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
16037 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
16038 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
16039 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
16040 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
16041 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
16042 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
16043 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
16044 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
16045 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
16046 ; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
16047 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
16048 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
16049 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16050 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16051 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16052 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
16053 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
16054 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
16055 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
16056 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
16057 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
16058 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
16059 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
16060 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
16061 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
16062 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
16063 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
16064 ; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
16065 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
16066 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
16067 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16068 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16069 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16070 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
16071 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
16072 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
16073 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
16074 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
16075 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
16076 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
16077 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
16078 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
16079 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
16080 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
16081 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
16082 ; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
16083 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
16084 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
16085 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16086 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16087 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16088 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
16089 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
16090 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
16091 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
16092 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
16093 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
16094 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
16095 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
16096 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16097 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
16098 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
16099 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
16100 ; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
16101 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
16102 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
16103 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16104 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16105 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16106 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
16107 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
16108 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
16109 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
16110 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
16111 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
16112 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
16113 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
16114 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
16115 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
16116 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
16117 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
16118 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
16119 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
16120 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
16121 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
16122 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
16123 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
16124 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
16125 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
16126 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
16127 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
16128 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
16129 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
16130 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
16131 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
16132 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
16133 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
16134 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
16135 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
16136 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
16137 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
16138 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
16139 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
16140 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
16141 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
16142 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
16143 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
16144 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
16145 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
16146 ; GFX8-NEXT: s_setpc_b64 s[30:31]
16148 ; GFX9-LABEL: v_fmul_v32bf16:
16150 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16151 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
16152 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
16153 ; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
16154 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
16155 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16156 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
16157 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
16158 ; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
16159 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
16160 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
16161 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
16162 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
16163 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
16164 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
16165 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
16166 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
16167 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
16168 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
16169 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
16170 ; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
16171 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16172 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16173 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
16174 ; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
16175 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
16176 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
16177 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
16178 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
16179 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
16180 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
16181 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
16182 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
16183 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
16184 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
16185 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
16186 ; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
16187 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
16188 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
16189 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
16190 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
16191 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
16192 ; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
16193 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
16194 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
16195 ; GFX9-NEXT: s_waitcnt vmcnt(0)
16196 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
16197 ; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
16198 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16199 ; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
16200 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
16201 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
16202 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
16203 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
16204 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
16205 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
16206 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
16207 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
16208 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
16209 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
16210 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
16211 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
16212 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
16213 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
16214 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
16215 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
16216 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
16217 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
16218 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
16219 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
16220 ; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
16221 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16222 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16223 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
16224 ; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
16225 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
16226 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
16227 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
16228 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
16229 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
16230 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
16231 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
16232 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
16233 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
16234 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
16235 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
16236 ; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
16237 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
16238 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
16239 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
16240 ; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
16241 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
16242 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
16243 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
16244 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
16245 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
16246 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
16247 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
16248 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
16249 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
16250 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
16251 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
16252 ; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
16253 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16254 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16255 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
16256 ; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
16257 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
16258 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
16259 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
16260 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
16261 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
16262 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
16263 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
16264 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
16265 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
16266 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
16267 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
16268 ; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
16269 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
16270 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16271 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
16272 ; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
16273 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
16274 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
16275 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
16276 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
16277 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
16278 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
16279 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
16280 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
16281 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
16282 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
16283 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
16284 ; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
16285 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16286 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16287 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
16288 ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
16289 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
16290 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
16291 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
16292 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
16293 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
16294 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
16295 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
16296 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
16297 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
16298 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
16299 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
16300 ; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
16301 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16302 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16303 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
16304 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
16305 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
16306 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
16307 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
16308 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
16309 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
16310 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
16311 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
16312 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
16313 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
16314 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
16315 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
16316 ; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
16317 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16318 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16319 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
16320 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
16321 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
16322 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
16323 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
16324 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
16325 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
16326 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
16327 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
16328 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
16329 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
16330 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
16331 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
16332 ; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
16333 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16334 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16335 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
16336 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
16337 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
16338 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
16339 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
16340 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
16341 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
16342 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
16343 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
16344 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
16345 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
16346 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
16347 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
16348 ; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
16349 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16350 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16351 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
16352 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
16353 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
16354 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
16355 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
16356 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
16357 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
16358 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
16359 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
16360 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
16361 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
16362 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
16363 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
16364 ; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
16365 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16366 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16367 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
16368 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
16369 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
16370 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
16371 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
16372 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
16373 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
16374 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
16375 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
16376 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
16377 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
16378 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
16379 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
16380 ; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
16381 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16382 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16383 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
16384 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
16385 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
16386 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
16387 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
16388 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
16389 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
16390 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
16391 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
16392 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16393 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
16394 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
16395 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
16396 ; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
16397 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16398 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16399 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
16400 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
16401 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
16402 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
16403 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
16404 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
16405 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
16406 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
16407 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
16408 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
16409 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
16410 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
16411 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
16412 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
16413 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
16414 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
16415 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
16416 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
16417 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
16418 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
16419 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
16420 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
16421 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
16422 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
16423 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
16424 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
16425 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
16426 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
16427 ; GFX9-NEXT: s_setpc_b64 s[30:31]
16429 ; GFX10-LABEL: v_fmul_v32bf16:
16431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16432 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
16433 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
16434 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
16435 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
16436 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
16437 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
16438 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
16439 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16440 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16441 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
16442 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
16443 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
16444 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
16445 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
16446 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16447 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
16448 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
16449 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16450 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16451 ; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
16452 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
16453 ; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
16454 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
16455 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16456 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16457 ; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
16458 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
16459 ; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
16460 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
16461 ; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
16462 ; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
16463 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
16464 ; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
16465 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
16466 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
16467 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16468 ; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
16469 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
16470 ; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
16471 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
16472 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16473 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16474 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
16475 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
16476 ; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27
16477 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
16478 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16479 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16480 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
16481 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
16482 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
16483 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
16484 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16485 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16486 ; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
16487 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
16488 ; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29
16489 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
16490 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16491 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16492 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
16493 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
16494 ; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28
16495 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
16496 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16497 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16498 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
16499 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
16500 ; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
16501 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
16502 ; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51
16503 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
16504 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16505 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16506 ; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
16507 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
16508 ; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30
16509 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
16510 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16511 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16512 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
16513 ; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23
16514 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
16515 ; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22
16516 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
16517 ; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
16518 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
16519 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
16520 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16521 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16522 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
16523 ; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25
16524 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
16525 ; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24
16526 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
16527 ; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
16528 ; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
16529 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
16530 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
16531 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
16532 ; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26
16533 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
16534 ; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
16535 ; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
16536 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
16537 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
16538 ; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
16539 ; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
16540 ; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
16541 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
16542 ; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
16543 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
16544 ; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
16545 ; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
16546 ; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
16547 ; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
16548 ; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
16549 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
16550 ; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
16551 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
16552 ; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
16553 ; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
16554 ; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
16555 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
16556 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
16557 ; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
16558 ; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
16559 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
16560 ; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
16561 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
16562 ; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
16563 ; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
16564 ; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
16565 ; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
16566 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
16567 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
16568 ; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
16569 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
16570 ; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
16571 ; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
16572 ; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
16573 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
16574 ; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
16575 ; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
16576 ; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
16577 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
16578 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
16579 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
16580 ; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
16581 ; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
16582 ; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
16583 ; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
16584 ; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
16585 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
16586 ; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
16587 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
16588 ; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
16589 ; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
16590 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
16591 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
16592 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
16593 ; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
16594 ; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
16595 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
16596 ; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
16597 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
16598 ; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
16599 ; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
16600 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
16601 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
16602 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
16603 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
16604 ; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
16605 ; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
16606 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
16607 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
16608 ; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
16609 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
16610 ; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
16611 ; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
16612 ; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
16613 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
16614 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
16615 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
16616 ; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
16617 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
16618 ; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
16619 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
16620 ; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
16621 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
16622 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
16623 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
16624 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
16625 ; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
16626 ; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
16627 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
16628 ; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
16629 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
16630 ; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
16631 ; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
16632 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
16633 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
16634 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
16635 ; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
16636 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
16637 ; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
16638 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
16639 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
16640 ; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
16641 ; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
16642 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
16643 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
16644 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
16645 ; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
16646 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
16647 ; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
16648 ; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
16649 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
16650 ; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
16651 ; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
16652 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
16653 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
16654 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
16655 ; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
16656 ; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
16657 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
16658 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
16659 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
16660 ; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
16661 ; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
16662 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
16663 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
16664 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16665 ; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
16666 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
16667 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16668 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
16669 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
16670 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
16671 ; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
16672 ; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
16673 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
16674 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
16675 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
16676 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
16677 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
16678 ; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
16679 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
16680 ; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
16681 ; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
16682 ; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
16683 ; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
16684 ; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
16685 ; GFX10-NEXT: s_waitcnt vmcnt(0)
16686 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
16687 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
16688 ; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
16689 ; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
16690 ; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
16691 ; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
16692 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
16693 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16694 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
16695 ; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
16696 ; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
16697 ; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
16698 ; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
16699 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
16700 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
16701 ; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
16702 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
16703 ; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
16704 ; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
16705 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
16706 ; GFX10-NEXT: s_setpc_b64 s[30:31]
16708 ; GFX11-LABEL: v_fmul_v32bf16:
16710 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16711 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
16712 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
16713 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
16714 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16715 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16716 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
16717 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
16718 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16719 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16720 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
16721 ; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
16722 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
16723 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16724 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
16725 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
16726 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
16727 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16728 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
16729 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
16730 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
16731 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
16732 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
16733 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
16734 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16735 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
16736 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
16737 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
16738 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16739 ; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
16740 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
16741 ; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
16742 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16743 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
16744 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
16745 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16746 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
16747 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
16748 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
16749 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
16750 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
16751 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
16752 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
16753 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16754 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16755 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16756 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16757 ; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
16758 ; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
16759 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
16760 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
16761 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16762 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
16763 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
16764 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16765 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16766 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
16767 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
16768 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16769 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
16770 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16771 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
16772 ; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
16773 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16
16774 ; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
16775 ; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
16776 ; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
16777 ; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
16778 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
16779 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
16780 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
16781 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
16782 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
16783 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
16784 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
16785 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
16786 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16787 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
16788 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16789 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16790 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
16791 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
16792 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16793 ; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20
16794 ; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71
16795 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
16796 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
16797 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
16798 ; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
16799 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16800 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
16801 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16802 ; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
16803 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16804 ; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51
16805 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16806 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
16807 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
16808 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16809 ; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
16810 ; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
16811 ; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
16812 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16813 ; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
16814 ; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
16815 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16816 ; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37
16817 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
16818 ; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
16819 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
16820 ; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30
16821 ; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
16822 ; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
16823 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
16824 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
16825 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
16826 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16827 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
16828 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
16829 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
16830 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
16831 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
16832 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
16833 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
16834 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
16835 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
16836 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
16837 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
16838 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
16839 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
16840 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
16841 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
16842 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
16843 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
16844 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
16845 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
16846 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
16847 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
16848 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
16849 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
16850 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
16851 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
16852 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
16853 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
16854 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
16855 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
16856 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
16857 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
16858 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
16859 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
16860 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
16861 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
16862 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
16863 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
16864 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
16865 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
16866 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
16867 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
16868 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
16869 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
16870 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
16871 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
16872 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
16873 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
16874 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
16875 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
16876 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
16877 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
16878 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
16879 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
16880 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
16881 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
16882 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
16883 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
16884 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
16885 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
16886 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
16887 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
16888 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
16889 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
16890 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
16891 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
16892 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
16893 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
16894 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
16895 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
16896 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
16897 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
16898 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
16899 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
16900 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
16901 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
16902 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
16903 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
16904 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
16905 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
16906 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
16907 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
16908 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
16909 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
16910 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
16911 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
16912 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
16913 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
16914 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
16915 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
16916 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
16917 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
16918 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
16919 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
16920 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
16921 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
16922 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
16923 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
16924 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
16925 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
16926 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
16927 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16928 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
16929 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
16930 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
16931 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
16932 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
16933 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
16934 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
16935 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
16936 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
16937 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
16938 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16939 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
16940 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
16941 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
16942 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
16943 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
16944 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
16945 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
16946 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
16947 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
16948 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
16949 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
16950 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16951 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
16952 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16953 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16954 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
16955 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
16956 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
16957 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
16958 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
16959 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
16960 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
16961 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
16962 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
16963 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
16964 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
16965 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
16966 ; GFX11-NEXT: s_waitcnt vmcnt(0)
16967 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
16968 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16969 ; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
16970 ; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18
16971 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16972 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
16973 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
16974 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
16975 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16976 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
16977 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
16978 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
16979 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16980 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
16981 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
16982 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
16983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
16984 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
16985 ; GFX11-NEXT: s_setpc_b64 s[30:31]
16986 %op = fmul <32 x bfloat> %a, %b
16987 ret <32 x bfloat> %op
; Scalar bfloat division (fdiv bfloat %a, %b).
; Per the assertions below, every run expands the divide into the f32
; v_div_scale / v_rcp / v_fma (or v_fmac) / v_div_fmas / v_div_fixup sequence.
; The GCN and GFX7 runs mask the operands and the result with 0xffff0000.
; GFX8 and newer shift both operands left by 16 and turn the f32 quotient
; back into 16 bits with v_bfe_u32, an add of 0x7fff, v_cmp_u_f32 plus
; v_cndmask, and a final right shift by 16.
; NOTE(review): that conversion tail looks like round-to-nearest-even with
; NaN quieting -- confirm against the backend lowering.
16990 define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
16991 ; GCN-LABEL: v_fdiv_bf16:
16993 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16994 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
16995 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
16996 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16997 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16998 ; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
16999 ; GCN-NEXT: v_rcp_f32_e32 v3, v2
17000 ; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17001 ; GCN-NEXT: v_fma_f32 v3, v4, v3, v3
17002 ; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
17003 ; GCN-NEXT: v_mul_f32_e32 v5, v4, v3
17004 ; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4
17005 ; GCN-NEXT: v_fma_f32 v5, v6, v3, v5
17006 ; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4
17007 ; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5
17008 ; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17009 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17010 ; GCN-NEXT: s_setpc_b64 s[30:31]
17012 ; GFX7-LABEL: v_fdiv_bf16:
17014 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17015 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17016 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17017 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17018 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17019 ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17020 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2
17021 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17022 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
17023 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
17024 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
17025 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
17026 ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
17027 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
17028 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
17029 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17030 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17031 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17033 ; GFX8-LABEL: v_fdiv_bf16:
17035 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17036 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17037 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17038 ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17039 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
17040 ; GFX8-NEXT: v_rcp_f32_e32 v4, v2
17041 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0
17042 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4
17043 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4
17044 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3
17045 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5
17046 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
17047 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5
17048 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17049 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17050 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17051 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17052 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
17053 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17054 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17055 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17056 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17058 ; GFX9-LABEL: v_fdiv_bf16:
17060 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17061 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17062 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17063 ; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17064 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
17065 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17066 ; GFX9-NEXT: v_rcp_f32_e32 v4, v2
17067 ; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0
17068 ; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4
17069 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4
17070 ; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3
17071 ; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5
17072 ; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3
17073 ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
17074 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17075 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17076 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
17077 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17078 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17079 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17080 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17081 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17083 ; GFX10-LABEL: v_fdiv_bf16:
17085 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17086 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17087 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17088 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
17089 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17090 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
17091 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17092 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
17093 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3
17094 ; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5
17095 ; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3
17096 ; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5
17097 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4
17098 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17099 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
17100 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
17101 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17102 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17103 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17104 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17105 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17107 ; GFX11-LABEL: v_fdiv_bf16:
17109 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17110 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17111 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17112 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17113 ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
17114 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
17115 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
17116 ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17117 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
17118 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3
17119 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17120 ; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3
17121 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17122 ; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5
17123 ; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3
17124 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17125 ; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5
17126 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4
17127 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17128 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17129 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
17130 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
17131 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17132 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17133 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17134 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17135 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17136 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17137 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17138 %op = fdiv bfloat %a, %b
; Declaration of the bfloat fabs intrinsic called by the fabs and fneg(fabs)
; tests in this file.
17142 declare bfloat @llvm.fabs.bf16(bfloat)
; Scalar bfloat fabs through llvm.fabs.bf16, operating on v0.
; Asserted output: GFX8 and newer use a single AND with 0x7fff (v_and_b16 on
; v0.l for the +real-true16 GFX11 run). The GCN and GFX7 runs AND the 32-bit
; value with 0x7fffffff between two 0xffff0000 masks.
17144 define bfloat @v_fabs_bf16(bfloat %a) {
17145 ; GCN-LABEL: v_fabs_bf16:
17147 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17148 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17149 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17150 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17151 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17152 ; GCN-NEXT: s_setpc_b64 s[30:31]
17154 ; GFX7-LABEL: v_fabs_bf16:
17156 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17157 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17158 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17159 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17160 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17161 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17163 ; GFX8-LABEL: v_fabs_bf16:
17165 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17166 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17167 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17169 ; GFX9-LABEL: v_fabs_bf16:
17171 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17172 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17173 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17175 ; GFX10-LABEL: v_fabs_bf16:
17177 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17178 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17179 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17181 ; GFX11TRUE16-LABEL: v_fabs_bf16:
17182 ; GFX11TRUE16: ; %bb.0:
17183 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17184 ; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
17185 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17187 ; GFX11FAKE16-LABEL: v_fabs_bf16:
17188 ; GFX11FAKE16: ; %bb.0:
17189 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17190 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17191 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17192 %op = call bfloat @llvm.fabs.bf16(bfloat %a)
; bfloat fabs on an inreg argument of an amdgpu_ps function returning i32.
; The fabs result is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane.
; NOTE(review): s_fneg_bf16 and s_fneg_fabs_bf16 mark this same readfirstlane
; pattern with a FIXME; presumably the same workaround applies here -- confirm.
; Asserted output: GFX8 and newer use s_and_b32 with 0x7fff followed by a
; 0xffff mask. The GCN and GFX7 runs use v_mul_f32, v_bfe_u32 (offset 16,
; width 15) and v_readfirstlane_b32.
17196 define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
17197 ; GCN-LABEL: s_fabs_bf16:
17199 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
17200 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
17201 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17202 ; GCN-NEXT: ; return to shader part epilog
17204 ; GFX7-LABEL: s_fabs_bf16:
17206 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
17207 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
17208 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17209 ; GFX7-NEXT: ; return to shader part epilog
17211 ; GFX8-LABEL: s_fabs_bf16:
17213 ; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
17214 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17215 ; GFX8-NEXT: ; return to shader part epilog
17217 ; GFX9-LABEL: s_fabs_bf16:
17219 ; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
17220 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17221 ; GFX9-NEXT: ; return to shader part epilog
17223 ; GFX10-LABEL: s_fabs_bf16:
17225 ; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
17226 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17227 ; GFX10-NEXT: ; return to shader part epilog
17229 ; GFX11-LABEL: s_fabs_bf16:
17231 ; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
17232 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17233 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17234 ; GFX11-NEXT: ; return to shader part epilog
17235 %op = call bfloat @llvm.fabs.bf16(bfloat %a)
17236 %cast = bitcast bfloat %op to i16
17237 %zext = zext i16 %cast to i32
17238 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Scalar bfloat fneg, operating on v0.
; Asserted output is a single XOR on every run: 0x80000000 on the GCN and
; GFX7 runs, 0x8000 on GFX8 and newer (v_xor_b16 on v0.l for the
; +real-true16 GFX11 run).
17242 define bfloat @v_fneg_bf16(bfloat %a) {
17243 ; GCN-LABEL: v_fneg_bf16:
17245 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17246 ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17247 ; GCN-NEXT: s_setpc_b64 s[30:31]
17249 ; GFX7-LABEL: v_fneg_bf16:
17251 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17252 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17253 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17255 ; GFX8-LABEL: v_fneg_bf16:
17257 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17258 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17259 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17261 ; GFX9-LABEL: v_fneg_bf16:
17263 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17264 ; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17267 ; GFX10-LABEL: v_fneg_bf16:
17269 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17270 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17271 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17273 ; GFX11TRUE16-LABEL: v_fneg_bf16:
17274 ; GFX11TRUE16: ; %bb.0:
17275 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17276 ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
17277 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17279 ; GFX11FAKE16-LABEL: v_fneg_bf16:
17280 ; GFX11FAKE16: ; %bb.0:
17281 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17282 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17283 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17284 %op = fneg bfloat %a
; Declaration of the readfirstlane intrinsic that s_fabs_bf16, s_fneg_bf16
; and s_fneg_fabs_bf16 call on their zero-extended i32 value.
17288 declare i32 @llvm.amdgcn.readfirstlane(i32)
17290 ; FIXME: readfirstlane hack for other bugs
; bfloat fneg on an inreg argument of an amdgpu_ps function returning i32.
; The negated value is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane (the workaround the FIXME above refers to).
; Asserted output: GFX8 and newer use s_xor_b32 with 0x8000 followed by a
; 0xffff mask. The GCN and GFX7 runs multiply by -1.0, shift right by 16 and
; use v_readfirstlane_b32.
17291 define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
17292 ; GCN-LABEL: s_fneg_bf16:
17294 ; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
17295 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17296 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17297 ; GCN-NEXT: ; return to shader part epilog
17299 ; GFX7-LABEL: s_fneg_bf16:
17301 ; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
17302 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17303 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17304 ; GFX7-NEXT: ; return to shader part epilog
17306 ; GFX8-LABEL: s_fneg_bf16:
17308 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
17309 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17310 ; GFX8-NEXT: ; return to shader part epilog
17312 ; GFX9-LABEL: s_fneg_bf16:
17314 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
17315 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17316 ; GFX9-NEXT: ; return to shader part epilog
17318 ; GFX10-LABEL: s_fneg_bf16:
17320 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
17321 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17322 ; GFX10-NEXT: ; return to shader part epilog
17324 ; GFX11-LABEL: s_fneg_bf16:
17326 ; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
17327 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17328 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17329 ; GFX11-NEXT: ; return to shader part epilog
17330 %op = fneg bfloat %a
17331 %cast = bitcast bfloat %op to i16
17332 %zext = zext i16 %cast to i32
17333 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; fneg(fabs(x)) on a scalar bfloat, operating on v0.
; Asserted output: GFX8 and newer emit a single OR with 0x8000 (v_or_b16 on
; v0.l for the +real-true16 GFX11 run). The GCN and GFX7 runs keep a separate
; AND with 0x7fffffff and XOR with 0x80000000, each surrounded by
; 0xffff0000 masks.
17337 define bfloat @v_fneg_fabs_bf16(bfloat %a) {
17338 ; GCN-LABEL: v_fneg_fabs_bf16:
17340 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17341 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17342 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17343 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17344 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17345 ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17346 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17347 ; GCN-NEXT: s_setpc_b64 s[30:31]
17349 ; GFX7-LABEL: v_fneg_fabs_bf16:
17351 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17352 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17353 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17354 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17355 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17356 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17357 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17358 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17360 ; GFX8-LABEL: v_fneg_fabs_bf16:
17362 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17363 ; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
17364 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17366 ; GFX9-LABEL: v_fneg_fabs_bf16:
17368 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17369 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
17370 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17372 ; GFX10-LABEL: v_fneg_fabs_bf16:
17374 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17375 ; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
17376 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17378 ; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
17379 ; GFX11TRUE16: ; %bb.0:
17380 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17381 ; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
17382 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17384 ; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
17385 ; GFX11FAKE16: ; %bb.0:
17386 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17387 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
17388 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17389 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17390 %op = fneg bfloat %fabs
17394 ; FIXME: readfirstlane hack for other bugs
; fneg(fabs(x)) on an inreg bfloat argument of an amdgpu_ps function
; returning i32. The result is bitcast to i16, zero-extended to i32 and passed
; through llvm.amdgcn.readfirstlane (the workaround the FIXME above refers to).
; Asserted output: GFX8 and newer use s_bitset1_b32 on bit 15 followed by a
; 0xffff mask. The GCN and GFX7 runs use v_mul_f32 and v_readfirstlane_b32,
; then s_bitset0_b32 on bit 31 and s_xor_b32 with 0x80000000 between
; 0xffff0000 masks, and finish with a right shift by 16.
17395 define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
17396 ; GCN-LABEL: s_fneg_fabs_bf16:
17398 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
17399 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17400 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
17401 ; GCN-NEXT: s_bitset0_b32 s0, 31
17402 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
17403 ; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
17404 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
17405 ; GCN-NEXT: ; return to shader part epilog
17407 ; GFX7-LABEL: s_fneg_fabs_bf16:
17409 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
17410 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17411 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
17412 ; GFX7-NEXT: s_bitset0_b32 s0, 31
17413 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
17414 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
17415 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
17416 ; GFX7-NEXT: ; return to shader part epilog
17418 ; GFX8-LABEL: s_fneg_fabs_bf16:
17420 ; GFX8-NEXT: s_bitset1_b32 s0, 15
17421 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17422 ; GFX8-NEXT: ; return to shader part epilog
17424 ; GFX9-LABEL: s_fneg_fabs_bf16:
17426 ; GFX9-NEXT: s_bitset1_b32 s0, 15
17427 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17428 ; GFX9-NEXT: ; return to shader part epilog
17430 ; GFX10-LABEL: s_fneg_fabs_bf16:
17432 ; GFX10-NEXT: s_bitset1_b32 s0, 15
17433 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17434 ; GFX10-NEXT: ; return to shader part epilog
17436 ; GFX11-LABEL: s_fneg_fabs_bf16:
17438 ; GFX11-NEXT: s_bitset1_b32 s0, 15
17439 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17440 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17441 ; GFX11-NEXT: ; return to shader part epilog
17442 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17443 %op = fneg bfloat %fabs
17444 %cast = bitcast bfloat %op to i16
17445 %zext = zext i16 %cast to i32
17446 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Declarations of llvm.minnum for scalar bfloat and for the 2, 3, 4, 8, 16
; and 32 element bfloat vector types.
17450 declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
17451 declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
17452 declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
17453 declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
17454 declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
17455 declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
17456 declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; Scalar bfloat llvm.minnum.
; Asserted output: every run performs the minimum with v_min_f32. The GCN and
; GFX7 runs mask operands and result with 0xffff0000. GFX8 and newer shift the
; operands left by 16 and convert the f32 result back with v_bfe_u32, an add
; of 0x7fff, v_cmp_u_f32 plus v_cndmask, and a right shift by 16.
; NOTE(review): that conversion tail looks like round-to-nearest-even with
; NaN quieting -- confirm against the backend lowering.
17458 define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
17459 ; GCN-LABEL: v_minnum_bf16:
17461 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17462 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17463 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17464 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17465 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17466 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1
17467 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17468 ; GCN-NEXT: s_setpc_b64 s[30:31]
17470 ; GFX7-LABEL: v_minnum_bf16:
17472 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17473 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17474 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17475 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17476 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17477 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
17478 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17479 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17481 ; GFX8-LABEL: v_minnum_bf16:
17483 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17484 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17485 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17486 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
17487 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17488 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17489 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17490 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
17491 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17492 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17493 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17494 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17496 ; GFX9-LABEL: v_minnum_bf16:
17498 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17499 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17500 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17501 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
17502 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17503 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17504 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17505 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
17506 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17507 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17508 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17509 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17511 ; GFX10-LABEL: v_minnum_bf16:
17513 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17514 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17515 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17516 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
17517 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
17518 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
17519 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17520 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17521 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17522 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17523 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17525 ; GFX11-LABEL: v_minnum_bf16:
17527 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17528 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17529 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17530 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17531 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
17532 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
17533 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
17534 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17536 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17537 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17539 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17540 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17541 %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
; <2 x bfloat> llvm.minnum.
; Asserted output for the GCN and GFX7 runs: the four input elements arrive
; in v0 through v3, are masked with 0xffff0000, and each pair gets its own
; v_min_f32.
; Asserted output for GFX8 and newer: both vectors are packed in v0 and v1.
; One element of each is obtained with a left shift by 16 and the other with
; a 0xffff0000 mask; each pair gets its own v_min_f32 and its own
; v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask tail. The two results are
; repacked with v_alignbit_b32 on GFX8 and with v_perm_b32 (selector
; 0x7060302) on GFX9 and newer.
17545 define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
17546 ; GCN-LABEL: v_minnum_v2bf16:
17548 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17549 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17550 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17551 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17552 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17553 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17554 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17555 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17556 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17557 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3
17558 ; GCN-NEXT: v_min_f32_e32 v0, v0, v2
17559 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17560 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17561 ; GCN-NEXT: s_setpc_b64 s[30:31]
17563 ; GFX7-LABEL: v_minnum_v2bf16:
17565 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17566 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17567 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17568 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17569 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17570 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17571 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17572 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17573 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17574 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
17575 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
17576 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17577 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17578 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17580 ; GFX8-LABEL: v_minnum_v2bf16:
17582 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17583 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17584 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17585 ; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
17586 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
17587 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
17588 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17589 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17590 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
17591 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
17592 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
17593 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
17594 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17595 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
17596 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17597 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17598 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
17599 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17600 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
17601 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17602 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
17603 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17605 ; GFX9-LABEL: v_minnum_v2bf16:
17607 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17608 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17609 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17610 ; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
17611 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17612 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17613 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
17614 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17615 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
17616 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
17617 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
17618 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
17619 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17620 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
17621 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17622 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
17623 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17624 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
17625 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
17626 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
17627 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17629 ; GFX10-LABEL: v_minnum_v2bf16:
17631 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17632 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17633 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17634 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17635 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17636 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
17637 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
17638 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
17639 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
17640 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
17641 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
17642 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
17643 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
17644 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
17645 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17646 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17647 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17648 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
17649 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17651 ; GFX11-LABEL: v_minnum_v2bf16:
17653 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17654 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17655 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17656 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17657 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17658 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
17659 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
17660 ; GFX11-NEXT: v_min_f32_e32 v2, v3, v2
17661 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17662 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
17663 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
17664 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
17665 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
17666 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
17667 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
17668 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
17669 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
17670 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17671 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17672 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17673 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17674 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
17675 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17676 %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
17677 ret <2 x bfloat> %op
17680 define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
17681 ; GCN-LABEL: v_minnum_v3bf16:
17683 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17684 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17685 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17686 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17687 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
17688 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17689 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
17690 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17691 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17692 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17693 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17694 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17695 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17696 ; GCN-NEXT: v_min_f32_e32 v2, v2, v5
17697 ; GCN-NEXT: v_min_f32_e32 v1, v1, v4
17698 ; GCN-NEXT: v_min_f32_e32 v0, v0, v3
17699 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17700 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17701 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17702 ; GCN-NEXT: s_setpc_b64 s[30:31]
17704 ; GFX7-LABEL: v_minnum_v3bf16:
17706 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17707 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17708 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17709 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17710 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
17711 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17712 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
17713 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17714 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17715 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17716 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17717 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17718 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17719 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
17720 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
17721 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
17722 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17723 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17724 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17725 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17727 ; GFX8-LABEL: v_minnum_v3bf16:
17729 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17730 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17731 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17732 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
17733 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
17734 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
17735 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
17736 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
17737 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17738 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17739 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17740 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
17741 ; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
17742 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
17743 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
17744 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
17745 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17746 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17747 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
17748 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
17749 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
17750 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17751 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
17752 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
17753 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
17754 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
17755 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
17756 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17757 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
17758 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17759 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17760 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
17761 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17763 ; GFX9-LABEL: v_minnum_v3bf16:
17765 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17766 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17767 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17768 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
17769 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
17770 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17771 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
17772 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
17773 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17774 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17775 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17776 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
17777 ; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
17778 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17779 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17780 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
17781 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
17782 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
17783 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
17784 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17785 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
17786 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
17787 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
17788 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
17789 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17790 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
17791 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
17792 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
17793 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
17794 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17796 ; GFX10-LABEL: v_minnum_v3bf16:
17798 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17799 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17800 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17801 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17802 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17803 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17804 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17805 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
17806 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
17807 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
17808 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
17809 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
17810 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
17811 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17812 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
17813 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17814 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
17815 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17816 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
17817 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17818 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17819 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17820 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17821 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17822 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17823 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17824 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
17825 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17827 ; GFX11TRUE16-LABEL: v_minnum_v3bf16:
17828 ; GFX11TRUE16: ; %bb.0:
17829 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17830 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17831 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17832 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17833 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17834 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17835 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17836 ; GFX11TRUE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17837 ; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17838 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17839 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
17840 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
17841 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17842 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
17843 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
17844 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17845 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
17846 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
17847 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17848 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17849 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17850 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17851 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17852 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17853 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17854 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17855 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17856 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17857 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
17858 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17860 ; GFX11FAKE16-LABEL: v_minnum_v3bf16:
17861 ; GFX11FAKE16: ; %bb.0:
17862 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17863 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17864 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17865 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17866 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17867 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17868 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17869 ; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17870 ; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17871 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17872 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
17873 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
17874 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17875 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
17876 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
17877 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17878 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
17879 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
17880 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17881 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17882 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17883 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17884 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17885 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17886 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17887 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17888 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17889 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17890 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
17891 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17892 %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
17893 ret <3 x bfloat> %op
; v_minnum_v4bf16: calls llvm.minnum on two <4 x bfloat> arguments (%a, %b) and
; returns the result unchanged.
;
; The check lines in this function are autogenerated (see the NOTE on the first
; line of this file). Regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
;
; What the recorded checks show:
;  * GCN and GFX7 prefixes: every element sits in its own VGPR (v0-v7). Each
;    register is multiplied by 1.0 and masked with 0xffff0000, four v_min_f32
;    are issued (v0/v4, v1/v5, v2/v6, v3/v7), and the four results are masked
;    with 0xffff0000 again.
;  * GFX8, GFX9, GFX10 and GFX11 prefixes: the operands occupy v0-v3. Each
;    register is split into two f32 inputs (v_lshlrev_b32 by 16 and v_and_b32
;    with 0xffff0000), v_min_f32 is applied per half, and every result goes
;    through a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 sequence before being repacked with v_alignbit_b32 (GFX8)
;    or v_perm_b32 with selector 0x7060302 (GFX9 and later).
;    NOTE(review): that sequence is presumably the f32 -> bf16 round-to-nearest-
;    even conversion with NaN quieting; confirm against the AMDGPU lowering.
;  * Both gfx1100 RUN lines (+real-true16 and -real-true16) share the single
;    GFX11 prefix for this function.
17896 define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
17897 ; GCN-LABEL: v_minnum_v4bf16:
17899 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17900 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17901 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
17902 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17903 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
17904 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17905 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
17906 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17907 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
17908 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
17909 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17910 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
17911 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17912 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17913 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17914 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17915 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17916 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7
17917 ; GCN-NEXT: v_min_f32_e32 v2, v2, v6
17918 ; GCN-NEXT: v_min_f32_e32 v1, v1, v5
17919 ; GCN-NEXT: v_min_f32_e32 v0, v0, v4
17920 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17921 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17922 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17923 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17924 ; GCN-NEXT: s_setpc_b64 s[30:31]
17926 ; GFX7-LABEL: v_minnum_v4bf16:
17928 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17929 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17930 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
17931 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17932 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
17933 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17934 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
17935 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17936 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
17937 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
17938 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17939 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
17940 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17941 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17942 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17943 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17944 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17945 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
17946 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
17947 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
17948 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
17949 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17950 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17951 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17952 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17953 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17955 ; GFX8-LABEL: v_minnum_v4bf16:
17957 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17958 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
17959 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
17960 ; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
17961 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
17962 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
17963 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17964 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17965 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
17966 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
17967 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
17968 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
17969 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
17970 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
17971 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
17972 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
17973 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
17974 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
17975 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17976 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
17977 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17978 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17979 ; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
17980 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
17981 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
17982 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17983 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17984 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
17985 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
17986 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
17987 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17988 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
17989 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
17990 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
17991 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
17992 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
17993 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17994 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
17995 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17996 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17997 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
17998 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
17999 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18001 ; GFX9-LABEL: v_minnum_v4bf16:
18003 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18004 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18005 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18006 ; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
18007 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18008 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18009 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
18010 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18011 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
18012 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
18013 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
18014 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18015 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
18016 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
18017 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
18018 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
18019 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18020 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
18021 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
18022 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
18023 ; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
18024 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18025 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18026 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
18027 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
18028 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
18029 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
18030 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18031 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
18032 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
18033 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
18034 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
18035 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18036 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
18037 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
18038 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
18039 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
18040 ; GFX9-NEXT: s_setpc_b64 s[30:31]
18042 ; GFX10-LABEL: v_minnum_v4bf16:
18044 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18045 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18046 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18047 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18048 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18049 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
18050 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
18051 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
18052 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18053 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18054 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
18055 ; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
18056 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
18057 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
18058 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
18059 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
18060 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
18061 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
18062 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
18063 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
18064 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
18065 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
18066 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18067 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
18068 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18069 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
18070 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
18071 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
18072 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18073 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18074 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18075 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18076 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
18077 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18078 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18079 ; GFX10-NEXT: s_setpc_b64 s[30:31]
18081 ; GFX11-LABEL: v_minnum_v4bf16:
18083 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18084 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
18085 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
18086 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18087 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18088 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18089 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18090 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18091 ; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
18092 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18093 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
18094 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18095 ; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
18096 ; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
18097 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
18098 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
18099 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
18100 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
18101 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
18102 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
18103 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
18104 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
18105 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
18106 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
18107 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
18108 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18109 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
18110 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18111 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
18112 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
18113 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18114 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18115 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18116 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18117 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18118 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
18119 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18120 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
18121 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18122 ; GFX11-NEXT: s_setpc_b64 s[30:31]
18123 %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
18124 ret <4 x bfloat> %op
18127 define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
18128 ; GCN-LABEL: v_minnum_v8bf16:
18130 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18131 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18132 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
18133 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
18134 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
18135 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
18136 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
18137 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
18138 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
18139 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
18140 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
18141 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
18142 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
18143 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
18144 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
18145 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
18146 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
18147 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18148 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18149 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18150 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18151 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18152 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18153 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18154 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18155 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18156 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18157 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18158 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18159 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18160 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18161 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18162 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18163 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15
18164 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14
18165 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13
18166 ; GCN-NEXT: v_min_f32_e32 v4, v4, v12
18167 ; GCN-NEXT: v_min_f32_e32 v3, v3, v11
18168 ; GCN-NEXT: v_min_f32_e32 v2, v2, v10
18169 ; GCN-NEXT: v_min_f32_e32 v1, v1, v9
18170 ; GCN-NEXT: v_min_f32_e32 v0, v0, v8
18171 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18172 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18173 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18174 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18175 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18176 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18177 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18178 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18179 ; GCN-NEXT: s_setpc_b64 s[30:31]
18181 ; GFX7-LABEL: v_minnum_v8bf16:
18183 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18184 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18185 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
18186 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
18187 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
18188 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
18189 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
18190 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
18191 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
18192 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
18193 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
18194 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
18195 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
18196 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
18197 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
18198 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
18199 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
18200 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18201 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18202 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18203 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18204 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18205 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18206 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18207 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18208 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18209 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18210 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18211 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18212 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18213 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18214 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18215 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18216 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
18217 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
18218 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
18219 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
18220 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
18221 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
18222 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
18223 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
18224 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18225 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18226 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18227 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18228 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18229 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18230 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18231 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18232 ; GFX7-NEXT: s_setpc_b64 s[30:31]
18234 ; GFX8-LABEL: v_minnum_v8bf16:
18236 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18237 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18238 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18239 ; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
18240 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
18241 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
18242 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18243 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18244 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
18245 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
18246 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
18247 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
18248 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
18249 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
18250 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
18251 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
18252 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
18253 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
18254 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18255 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
18256 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
18257 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
18258 ; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
18259 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
18260 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
18261 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18262 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18263 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18264 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
18265 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
18266 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18267 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
18268 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
18269 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
18270 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
18271 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
18272 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18273 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
18274 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18275 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
18276 ; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
18277 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
18278 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
18279 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18280 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18281 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18282 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
18283 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
18284 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18285 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
18286 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
18287 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
18288 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
18289 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
18290 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18291 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
18292 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
18293 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
18294 ; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
18295 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
18296 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
18297 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18298 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18299 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18300 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
18301 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
18302 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18303 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
18304 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
18305 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
18306 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
18307 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
18308 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18309 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
18310 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
18311 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
18312 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
18313 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
18314 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
18315 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
18316 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
18317 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
18318 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18320 ; GFX9-LABEL: v_minnum_v8bf16:
18322 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18323 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18324 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18325 ; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
18326 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18327 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18328 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
18329 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18330 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
18331 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
18332 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
18333 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
18334 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
18335 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
18336 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
18337 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
18338 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18339 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
18340 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
18341 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
18342 ; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
18343 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18344 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18345 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
18346 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
18347 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
18348 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
18349 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18350 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
18351 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
18352 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
18353 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
18354 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18355 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
18356 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18357 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
18358 ; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
18359 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18360 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18361 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
18362 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
18363 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
18364 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
18365 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18366 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
18367 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
18368 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
18369 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
18370 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18371 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
18372 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
18373 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
18374 ; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
18375 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18376 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18377 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
18378 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
18379 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
18380 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
18381 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18382 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
18383 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
18384 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
18385 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
18386 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18387 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
18388 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
18389 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
18390 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
18391 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
18392 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
18393 ; GFX9-NEXT: s_setpc_b64 s[30:31]
18395 ; GFX10-LABEL: v_minnum_v8bf16:
18397 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18398 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18399 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18400 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18401 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18402 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
18403 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18404 ; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
18405 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
18406 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18407 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
18408 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
18409 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
18410 ; GFX10-NEXT: v_min_f32_e32 v7, v10, v9
18411 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
18412 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
18413 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
18414 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
18415 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
18416 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
18417 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18418 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
18419 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18420 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
18421 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
18422 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
18423 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
18424 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
18425 ; GFX10-NEXT: v_min_f32_e32 v6, v10, v6
18426 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
18427 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18428 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18429 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
18430 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
18431 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
18432 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
18433 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18434 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18435 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
18436 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
18437 ; GFX10-NEXT: v_min_f32_e32 v5, v15, v13
18438 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
18439 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
18440 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18441 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
18442 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
18443 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
18444 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
18445 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
18446 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
18447 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
18448 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
18449 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
18450 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
18451 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
18452 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
18453 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
18454 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
18455 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
18456 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18457 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18458 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18459 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18460 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
18461 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18462 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18463 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18464 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18465 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
18466 ; GFX10-NEXT: s_setpc_b64 s[30:31]
18468 ; GFX11-LABEL: v_minnum_v8bf16:
18470 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18471 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
18472 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18473 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18474 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18475 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
18476 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18477 ; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
18478 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
18479 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
18480 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18481 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
18482 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18483 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v7
18484 ; GFX11-NEXT: v_min_f32_e32 v7, v10, v9
18485 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
18486 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
18487 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18488 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
18489 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
18490 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
18491 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
18492 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18493 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
18494 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
18495 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
18496 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
18497 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18498 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
18499 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18500 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
18501 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
18502 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18503 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18504 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
18505 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18506 ; GFX11-NEXT: v_min_f32_e32 v6, v10, v6
18507 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
18508 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
18509 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
18510 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
18511 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
18512 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18513 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
18514 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
18515 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
18516 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18517 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18518 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
18519 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
18520 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
18521 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
18522 ; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
18523 ; GFX11-NEXT: v_min_f32_e32 v5, v15, v13
18524 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18525 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
18526 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
18527 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
18528 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
18529 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18530 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
18531 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
18532 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
18533 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
18534 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
18535 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
18536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
18537 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18538 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18539 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18540 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18541 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
18542 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
18543 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18544 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18545 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18546 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18547 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
18548 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
18549 ; GFX11-NEXT: s_setpc_b64 s[30:31]
18550 %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
18551 ret <8 x bfloat> %op
18554 define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
18555 ; GCN-LABEL: v_minnum_v16bf16:
18557 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18558 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
18559 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
18560 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
18561 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18562 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30
18563 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
18564 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
18565 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
18566 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18567 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29
18568 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
18569 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
18570 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
18571 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18572 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28
18573 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
18574 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
18575 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
18576 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
18577 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
18578 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
18579 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
18580 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
18581 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
18582 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
18583 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
18584 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
18585 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
18586 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
18587 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
18588 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
18589 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
18590 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
18591 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
18592 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
18593 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
18594 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
18595 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18596 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
18597 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
18598 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
18599 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18600 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27
18601 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
18602 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
18603 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18604 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
18605 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18606 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
18607 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18608 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
18609 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18610 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18611 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18612 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
18613 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18614 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
18615 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18616 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
18617 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18618 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
18619 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18620 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
18621 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18622 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18623 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18624 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18625 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26
18626 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25
18627 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24
18628 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23
18629 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22
18630 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21
18631 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20
18632 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19
18633 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18
18634 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17
18635 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16
18636 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18637 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18638 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18639 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18640 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18641 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18642 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18643 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18644 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18645 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18646 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18647 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18648 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18649 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18650 ; GCN-NEXT: s_waitcnt vmcnt(0)
18651 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
18652 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18653 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16
18654 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18655 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18656 ; GCN-NEXT: s_setpc_b64 s[30:31]
18658 ; GFX7-LABEL: v_minnum_v16bf16:
18660 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18661 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
18662 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
18663 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
18664 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18665 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
18666 ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
18667 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
18668 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
18669 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18670 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18671 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
18672 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
18673 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
18674 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
18675 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
18676 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
18677 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
18678 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
18679 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
18680 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
18681 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
18682 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
18683 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
18684 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
18685 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
18686 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
18687 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
18688 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
18689 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
18690 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
18691 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
18692 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
18693 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
18694 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
18695 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
18696 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
18697 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18698 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
18699 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
18700 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18701 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
18702 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18703 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
18704 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18705 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
18706 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18707 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
18708 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18709 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
18710 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18711 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
18712 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18713 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18714 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
18715 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18716 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
18717 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18718 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
18719 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18720 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
18721 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18722 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
18723 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18724 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18725 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18726 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
18727 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
18728 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
18729 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
18730 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
18731 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
18732 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
18733 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
18734 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
18735 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
18736 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
18737 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
18738 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
18739 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18740 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18741 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18742 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18743 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18744 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18745 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18746 ; GFX7-NEXT: s_waitcnt vmcnt(0)
18747 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
18748 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18749 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
18750 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18751 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18752 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18753 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18754 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18755 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18756 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18757 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18758 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18759 ; GFX7-NEXT: s_setpc_b64 s[30:31]
18761 ; GFX8-LABEL: v_minnum_v16bf16:
18763 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18764 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
18765 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
18766 ; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
18767 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
18768 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
18769 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
18770 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18771 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18772 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18773 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
18774 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
18775 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
18776 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
18777 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
18778 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
18779 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
18780 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
18781 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18782 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
18783 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
18784 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
18785 ; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
18786 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
18787 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
18788 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18789 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18790 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18791 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
18792 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
18793 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
18794 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
18795 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
18796 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
18797 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
18798 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
18799 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18800 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
18801 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
18802 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
18803 ; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
18804 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
18805 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
18806 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18807 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18808 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18809 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
18810 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
18811 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
18812 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
18813 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
18814 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
18815 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
18816 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
18817 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18818 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
18819 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
18820 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
18821 ; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
18822 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
18823 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
18824 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18825 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18826 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18827 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
18828 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
18829 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
18830 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
18831 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
18832 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
18833 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
18834 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
18835 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18836 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
18837 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
18838 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
18839 ; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
18840 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
18841 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
18842 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18843 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18844 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18845 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
18846 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
18847 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
18848 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
18849 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
18850 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
18851 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
18852 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
18853 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18854 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
18855 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
18856 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
18857 ; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
18858 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
18859 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
18860 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18861 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18862 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18863 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
18864 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
18865 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
18866 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
18867 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
18868 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
18869 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
18870 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
18871 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18872 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
18873 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
18874 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
18875 ; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
18876 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
18877 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
18878 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18879 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18880 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18881 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
18882 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
18883 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
18884 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
18885 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
18886 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
18887 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18888 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
18889 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18890 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
18891 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
18892 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
18893 ; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
18894 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
18895 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
18896 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18897 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18898 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18899 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
18900 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
18901 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
18902 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
18903 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
18904 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
18905 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
18906 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
18907 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18908 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
18909 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
18910 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
18911 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
18912 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
18913 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
18914 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
18915 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
18916 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
18917 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
18918 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
18919 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
18920 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
18921 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
18922 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
18923 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
18924 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
18925 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18927 ; GFX9-LABEL: v_minnum_v16bf16:
18929 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18930 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
18931 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
18932 ; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
18933 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18934 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18935 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
18936 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18937 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
18938 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
18939 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
18940 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
18941 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
18942 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
18943 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
18944 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
18945 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18946 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
18947 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
18948 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
18949 ; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
18950 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18951 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18952 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
18953 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
18954 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
18955 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
18956 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
18957 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
18958 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
18959 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
18960 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
18961 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18962 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
18963 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
18964 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
18965 ; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
18966 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18967 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18968 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
18969 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
18970 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
18971 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
18972 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
18973 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
18974 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
18975 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
18976 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
18977 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18978 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
18979 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
18980 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
18981 ; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
18982 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18983 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18984 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
18985 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
18986 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
18987 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
18988 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
18989 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
18990 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
18991 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
18992 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
18993 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18994 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
18995 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
18996 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
18997 ; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
18998 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18999 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19000 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
19001 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
19002 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
19003 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
19004 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
19005 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
19006 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
19007 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
19008 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
19009 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
19010 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
19011 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
19012 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
19013 ; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
19014 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19015 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19016 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
19017 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
19018 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
19019 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
19020 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
19021 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
19022 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
19023 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
19024 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
19025 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
19026 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
19027 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
19028 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
19029 ; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
19030 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19031 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19032 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
19033 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
19034 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
19035 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
19036 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
19037 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
19038 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
19039 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
19040 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
19041 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
19042 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
19043 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
19044 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
19045 ; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
19046 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19047 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19048 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
19049 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
19050 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
19051 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
19052 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
19053 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
19054 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
19055 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
19056 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
19057 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
19058 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
19059 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
19060 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
19061 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
19062 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
19063 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
19064 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
19065 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
19066 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
19067 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
19068 ; GFX9-NEXT: s_setpc_b64 s[30:31]
19070 ; GFX10-LABEL: v_minnum_v16bf16:
19072 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19073 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
19074 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
19075 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19076 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19077 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
19078 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19079 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
19080 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
19081 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
19082 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19083 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
19084 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
19085 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
19086 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
19087 ; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
19088 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
19089 ; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
19090 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
19091 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
19092 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
19093 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
19094 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
19095 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
19096 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
19097 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
19098 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19099 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19100 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
19101 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
19102 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19103 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19104 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
19105 ; GFX10-NEXT: v_min_f32_e32 v17, v20, v19
19106 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
19107 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
19108 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
19109 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
19110 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
19111 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
19112 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
19113 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
19114 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
19115 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19116 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19117 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
19118 ; GFX10-NEXT: v_min_f32_e32 v13, v19, v18
19119 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
19120 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
19121 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19122 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
19123 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
19124 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
19125 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
19126 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19127 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
19128 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
19129 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
19130 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
19131 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19132 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19133 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
19134 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19135 ; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
19136 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
19137 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
19138 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
19139 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
19140 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
19141 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19142 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
19143 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
19144 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
19145 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
19146 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19147 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
19148 ; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
19149 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19150 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
19151 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
19152 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
19153 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
19154 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
19155 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19156 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
19157 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
19158 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
19159 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
19160 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19161 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19162 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
19163 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
19164 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
19165 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19166 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
19167 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
19168 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
19169 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19170 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
19171 ; GFX10-NEXT: v_min_f32_e32 v19, v22, v20
19172 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
19173 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
19174 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19175 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19176 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
19177 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
19178 ; GFX10-NEXT: v_min_f32_e32 v9, v22, v20
19179 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
19180 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
19181 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
19182 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
19183 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
19184 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
19185 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
19186 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
19187 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
19188 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19189 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
19190 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19191 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
19192 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
19193 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
19194 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
19195 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19196 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
19197 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
19198 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
19199 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19200 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
19201 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19202 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
19203 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
19204 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19205 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
19206 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
19207 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19208 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
19209 ; GFX10-NEXT: s_setpc_b64 s[30:31]
19211 ; GFX11-LABEL: v_minnum_v16bf16:
19213 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19214 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
19215 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
19216 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19217 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
19218 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
19219 ; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
19220 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
19221 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19222 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
19223 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
19224 ; GFX11-NEXT: v_min_f32_e32 v17, v18, v17
19225 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v14
19226 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
19227 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
19228 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
19229 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
19230 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19231 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
19232 ; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
19233 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
19234 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
19235 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
19236 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19237 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
19238 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
19239 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
19240 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
19241 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
19242 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19243 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19244 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
19245 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
19246 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19247 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
19248 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
19249 ; GFX11-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
19250 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
19251 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
19252 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
19253 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19254 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19255 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19256 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
19257 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
19258 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
19259 ; GFX11-NEXT: v_min_f32_e32 v4, v4, v12
19260 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
19261 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19262 ; GFX11-NEXT: v_min_f32_e32 v5, v5, v13
19263 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
19264 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19265 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
19266 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
19267 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
19268 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19269 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
19270 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
19271 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19272 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
19273 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
19274 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
19275 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
19276 ; GFX11-NEXT: v_min_f32_e32 v12, v18, v12
19277 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19278 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
19279 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
19280 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
19281 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
19282 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19283 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19284 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
19285 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
19286 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
19287 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
19288 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
19289 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
19290 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19291 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
19292 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
19293 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
19294 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19295 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19296 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
19297 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
19298 ; GFX11-NEXT: v_min_f32_e32 v18, v19, v18
19299 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19300 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
19301 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19302 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19303 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
19304 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
19305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19306 ; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
19307 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v11
19308 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
19309 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
19310 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
19311 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
19312 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
19313 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
19314 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
19315 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
19316 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19317 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
19318 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
19319 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
19320 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
19321 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
19322 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19323 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
19324 ; GFX11-NEXT: v_min_f32_e32 v19, v22, v20
19325 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
19326 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
19327 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19328 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
19329 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
19330 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19331 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19332 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
19333 ; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9
19334 ; GFX11-NEXT: v_min_f32_e32 v9, v22, v20
19335 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19336 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
19337 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
19338 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
19339 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
19340 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
19341 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
19342 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19343 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
19344 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
19345 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19346 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
19347 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
19348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19349 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19350 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
19351 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
19352 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19353 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
19354 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19355 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
19356 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19357 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
19358 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19359 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
19360 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19361 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
19362 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
19363 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19364 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
19365 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
19366 ; GFX11-NEXT: s_setpc_b64 s[30:31]
19367 %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
19368 ret <16 x bfloat> %op
19371 define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
19372 ; GCN-LABEL: v_minnum_v32bf16:
19374 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19375 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
19376 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
19377 ; GCN-NEXT: s_waitcnt vmcnt(1)
19378 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
19379 ; GCN-NEXT: s_waitcnt vmcnt(0)
19380 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
19381 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19382 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19383 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
19384 ; GCN-NEXT: v_min_f32_e32 v31, v31, v32
19385 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
19386 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19387 ; GCN-NEXT: s_waitcnt vmcnt(0)
19388 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19389 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19390 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
19391 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32
19392 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
19393 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19394 ; GCN-NEXT: s_waitcnt vmcnt(0)
19395 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19396 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19397 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
19398 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32
19399 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
19400 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19401 ; GCN-NEXT: s_waitcnt vmcnt(0)
19402 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19403 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19404 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
19405 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32
19406 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
19407 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19408 ; GCN-NEXT: s_waitcnt vmcnt(0)
19409 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19410 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19411 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
19412 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32
19413 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
19414 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19415 ; GCN-NEXT: s_waitcnt vmcnt(0)
19416 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19417 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19418 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
19419 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32
19420 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
19421 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19422 ; GCN-NEXT: s_waitcnt vmcnt(0)
19423 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19424 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19425 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
19426 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32
19427 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
19428 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19429 ; GCN-NEXT: s_waitcnt vmcnt(0)
19430 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19431 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19432 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
19433 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32
19434 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
19435 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19436 ; GCN-NEXT: s_waitcnt vmcnt(0)
19437 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19438 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19439 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
19440 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32
19441 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
19442 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19443 ; GCN-NEXT: s_waitcnt vmcnt(0)
19444 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19445 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19446 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
19447 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32
19448 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
19449 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19450 ; GCN-NEXT: s_waitcnt vmcnt(0)
19451 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19452 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19453 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
19454 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32
19455 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
19456 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19457 ; GCN-NEXT: s_waitcnt vmcnt(0)
19458 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19459 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19460 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
19461 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32
19462 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
19463 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19464 ; GCN-NEXT: s_waitcnt vmcnt(0)
19465 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19466 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19467 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
19468 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32
19469 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
19470 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19471 ; GCN-NEXT: s_waitcnt vmcnt(0)
19472 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19473 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19474 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
19475 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32
19476 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
19477 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19478 ; GCN-NEXT: s_waitcnt vmcnt(0)
19479 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19480 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19481 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
19482 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32
19483 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
19484 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19485 ; GCN-NEXT: s_waitcnt vmcnt(0)
19486 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19487 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19488 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
19489 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32
19490 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
19491 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19492 ; GCN-NEXT: s_waitcnt vmcnt(0)
19493 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19494 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19495 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
19496 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32
19497 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
19498 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19499 ; GCN-NEXT: s_waitcnt vmcnt(0)
19500 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19501 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19502 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
19503 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32
19504 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
19505 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19506 ; GCN-NEXT: s_waitcnt vmcnt(0)
19507 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19508 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19509 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
19510 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32
19511 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
19512 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19513 ; GCN-NEXT: s_waitcnt vmcnt(0)
19514 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19515 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19516 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
19517 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32
19518 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
19519 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19520 ; GCN-NEXT: s_waitcnt vmcnt(0)
19521 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19522 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19523 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
19524 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32
19525 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
19526 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19527 ; GCN-NEXT: s_waitcnt vmcnt(0)
19528 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19529 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19530 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
19531 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32
19532 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
19533 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19534 ; GCN-NEXT: s_waitcnt vmcnt(0)
19535 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19536 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19537 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
19538 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32
19539 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
19540 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19541 ; GCN-NEXT: s_waitcnt vmcnt(0)
19542 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19543 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19544 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
19545 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32
19546 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
19547 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19548 ; GCN-NEXT: s_waitcnt vmcnt(0)
19549 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19550 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19551 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
19552 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32
19553 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
19554 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19555 ; GCN-NEXT: s_waitcnt vmcnt(0)
19556 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19557 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19558 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
19559 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32
19560 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
19561 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19562 ; GCN-NEXT: s_waitcnt vmcnt(0)
19563 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19564 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19565 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
19566 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32
19567 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
19568 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19569 ; GCN-NEXT: s_waitcnt vmcnt(0)
19570 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19571 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19572 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
19573 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32
19574 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
19575 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19576 ; GCN-NEXT: s_waitcnt vmcnt(0)
19577 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19578 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19579 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
19580 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32
19581 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
19582 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19583 ; GCN-NEXT: s_waitcnt vmcnt(0)
19584 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19585 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19586 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
19587 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32
19588 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
19589 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19590 ; GCN-NEXT: s_waitcnt vmcnt(0)
19591 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19592 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19593 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
19594 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32
19595 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
19596 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19597 ; GCN-NEXT: s_waitcnt vmcnt(0)
19598 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19599 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19600 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32
19601 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19602 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19603 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19604 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19605 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19606 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19607 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19608 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19609 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19610 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19611 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19612 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19613 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19614 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19615 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19616 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19617 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19618 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19619 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19620 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19621 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19622 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19623 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19624 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19625 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19626 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19627 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19628 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19629 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19630 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19631 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19632 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19633 ; GCN-NEXT: s_setpc_b64 s[30:31]
19635 ; GFX7-LABEL: v_minnum_v32bf16:
19637 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19638 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
19639 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
19640 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
19641 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19642 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
19643 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19644 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
19645 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19646 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
19647 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19648 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
19649 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19650 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
19651 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19652 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
19653 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19654 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
19655 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19656 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
19657 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19658 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
19659 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19660 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
19661 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19662 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
19663 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19664 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
19665 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19666 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
19667 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19668 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
19669 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19670 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
19671 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19672 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
19673 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19674 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
19675 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19676 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
19677 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19678 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
19679 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19680 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
19681 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19682 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
19683 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19684 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
19685 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19686 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
19687 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19688 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
19689 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19690 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
19691 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19692 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
19693 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19694 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
19695 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19696 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
19697 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19698 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
19699 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19700 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
19701 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19702 ; GFX7-NEXT: s_waitcnt vmcnt(1)
19703 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
19704 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19705 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19706 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19707 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19708 ; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
19709 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
19710 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19711 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19712 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19713 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19714 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
19715 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
19716 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19717 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19718 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19719 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19720 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
19721 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
19722 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19723 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19724 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19725 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19726 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
19727 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
19728 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19729 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19730 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19731 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19732 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
19733 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
19734 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19735 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19736 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19737 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19738 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
19739 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
19740 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19741 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19742 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19743 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19744 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
19745 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
19746 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19747 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19748 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19749 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19750 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
19751 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
19752 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19753 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19754 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19755 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19756 ; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
19757 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
19758 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19759 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19760 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19761 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19762 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
19763 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
19764 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19765 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19766 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19767 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19768 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
19769 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
19770 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19771 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19772 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19773 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19774 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
19775 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
19776 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19777 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19778 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19779 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19780 ; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
19781 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
19782 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19783 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19784 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19785 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19786 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
19787 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
19788 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19789 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19790 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19791 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19792 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
19793 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
19794 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19795 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19796 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19797 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19798 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
19799 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
19800 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19801 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19802 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19803 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19804 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
19805 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
19806 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19807 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19808 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19809 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19810 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
19811 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
19812 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19813 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19814 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19815 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19816 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
19817 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
19818 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19819 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19820 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19821 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19822 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
19823 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
19824 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19825 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19826 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19827 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19828 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
19829 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
19830 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19831 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19832 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19833 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19834 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
19835 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
19836 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19837 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19838 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19839 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19840 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
19841 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
19842 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19843 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19844 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19845 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19846 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
19847 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
19848 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19849 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19850 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19851 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19852 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
19853 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
19854 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19855 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19856 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19857 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19858 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
19859 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
19860 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19861 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19862 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19863 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19864 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
19865 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
19866 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19867 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19868 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19869 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19870 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
19871 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
19872 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19873 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19874 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19875 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19876 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
19877 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
19878 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19879 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19880 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19881 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19882 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
19883 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
19884 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19885 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19886 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19887 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19888 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
19889 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
19890 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19891 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19892 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19893 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19894 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
19895 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19896 ; GFX7-NEXT: s_setpc_b64 s[30:31]
19898 ; GFX8-LABEL: v_minnum_v32bf16:
19900 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19901 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
19902 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
19903 ; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
19904 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
19905 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
19906 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
19907 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19908 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19909 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
19910 ; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
19911 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
19912 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
19913 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
19914 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
19915 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
19916 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
19917 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
19918 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
19919 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
19920 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
19921 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
19922 ; GFX8-NEXT: v_min_f32_e32 v32, v32, v30
19923 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
19924 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
19925 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19926 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19927 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19928 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
19929 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
19930 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
19931 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
19932 ; GFX8-NEXT: s_waitcnt vmcnt(0)
19933 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
19934 ; GFX8-NEXT: v_min_f32_e32 v33, v33, v34
19935 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19936 ; GFX8-NEXT: v_min_f32_e32 v30, v15, v30
19937 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
19938 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
19939 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
19940 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
19941 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
19942 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
19943 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
19944 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
19945 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19946 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
19947 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
19948 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
19949 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
19950 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
19951 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19952 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
19953 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
19954 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
19955 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
19956 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
19957 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
19958 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
19959 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
19960 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
19961 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
19962 ; GFX8-NEXT: v_min_f32_e32 v29, v33, v29
19963 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
19964 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
19965 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19966 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19967 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19968 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
19969 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
19970 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
19971 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
19972 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
19973 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
19974 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
19975 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
19976 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
19977 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
19978 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
19979 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
19980 ; GFX8-NEXT: v_min_f32_e32 v28, v33, v28
19981 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
19982 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
19983 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19984 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19985 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19986 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
19987 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
19988 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
19989 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
19990 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
19991 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
19992 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
19993 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
19994 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
19995 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
19996 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
19997 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
19998 ; GFX8-NEXT: v_min_f32_e32 v27, v33, v27
19999 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
20000 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
20001 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20002 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20003 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20004 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
20005 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
20006 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
20007 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
20008 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
20009 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
20010 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
20011 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
20012 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
20013 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
20014 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
20015 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
20016 ; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
20017 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
20018 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
20019 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20020 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20021 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20022 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
20023 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
20024 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
20025 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
20026 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
20027 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
20028 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
20029 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
20030 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
20031 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
20032 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
20033 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
20034 ; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
20035 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
20036 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
20037 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20038 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20039 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20040 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
20041 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
20042 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
20043 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
20044 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
20045 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
20046 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
20047 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
20048 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
20049 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
20050 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
20051 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
20052 ; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
20053 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
20054 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
20055 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20056 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20057 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20058 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
20059 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
20060 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
20061 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
20062 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
20063 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
20064 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
20065 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
20066 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
20067 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
20068 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
20069 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
20070 ; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
20071 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
20072 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
20073 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20074 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20075 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20076 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
20077 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
20078 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
20079 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
20080 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
20081 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
20082 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
20083 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
20084 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
20085 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
20086 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
20087 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
20088 ; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
20089 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
20090 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
20091 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20092 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20093 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20094 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
20095 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
20096 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
20097 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
20098 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
20099 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
20100 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
20101 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
20102 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
20103 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
20104 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
20105 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
20106 ; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
20107 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
20108 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
20109 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20110 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20111 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20112 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
20113 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
20114 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
20115 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
20116 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
20117 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
20118 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
20119 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
20120 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
20121 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
20122 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
20123 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
20124 ; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
20125 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
20126 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
20127 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20128 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20129 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20130 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
20131 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
20132 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
20133 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
20134 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
20135 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
20136 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
20137 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
20138 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
20139 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
20140 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
20141 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
20142 ; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
20143 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
20144 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
20145 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20146 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20147 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20148 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
20149 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
20150 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
20151 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
20152 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
20153 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
20154 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
20155 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
20156 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
20157 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
20158 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
20159 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
20160 ; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
20161 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
20162 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
20163 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20164 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20165 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20166 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
20167 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
20168 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
20169 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
20170 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
20171 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
20172 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
20173 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
20174 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
20175 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
20176 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
20177 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
20178 ; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
20179 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
20180 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
20181 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20182 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20183 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20184 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
20185 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
20186 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
20187 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
20188 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
20189 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
20190 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
20191 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
20192 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
20193 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
20194 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
20195 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
20196 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
20197 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
20198 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
20199 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
20200 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
20201 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
20202 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
20203 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
20204 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
20205 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
20206 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
20207 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
20208 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
20209 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
20210 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
20211 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
20212 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
20213 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
20214 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
20215 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
20216 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
20217 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
20218 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
20219 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
20220 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
20221 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
20222 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
20223 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
20224 ; GFX8-NEXT: s_setpc_b64 s[30:31]
20226 ; GFX9-LABEL: v_minnum_v32bf16:
20228 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20229 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
20230 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
20231 ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
20232 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
20233 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20234 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
20235 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
20236 ; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
20237 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
20238 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
20239 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
20240 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
20241 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
20242 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
20243 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
20244 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
20245 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
20246 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
20247 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
20248 ; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
20249 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20250 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20251 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
20252 ; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
20253 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
20254 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
20255 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
20256 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
20257 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
20258 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
20259 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
20260 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
20261 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
20262 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
20263 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
20264 ; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
20265 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
20266 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
20267 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
20268 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
20269 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
20270 ; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
20271 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
20272 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
20273 ; GFX9-NEXT: s_waitcnt vmcnt(0)
20274 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
20275 ; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
20276 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20277 ; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
20278 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
20279 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
20280 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
20281 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
20282 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
20283 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
20284 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
20285 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
20286 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
20287 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
20288 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
20289 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
20290 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
20291 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
20292 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
20293 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
20294 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
20295 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
20296 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
20297 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
20298 ; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
20299 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20300 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20301 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
20302 ; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
20303 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
20304 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
20305 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
20306 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
20307 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
20308 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
20309 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
20310 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
20311 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
20312 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
20313 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
20314 ; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
20315 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20316 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20317 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
20318 ; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
20319 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
20320 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
20321 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
20322 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
20323 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
20324 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
20325 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
20326 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
20327 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
20328 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
20329 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
20330 ; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
20331 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20332 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20333 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
20334 ; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
20335 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
20336 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
20337 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
20338 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
20339 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
20340 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
20341 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
20342 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
20343 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
20344 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
20345 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
20346 ; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
20347 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20348 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20349 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
20350 ; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
20351 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
20352 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
20353 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
20354 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
20355 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
20356 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
20357 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
20358 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
20359 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
20360 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
20361 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
20362 ; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
20363 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20364 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20365 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
20366 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
20367 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
20368 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
20369 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
20370 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
20371 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
20372 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
20373 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
20374 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
20375 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
20376 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
20377 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
20378 ; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
20379 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20380 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20381 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
20382 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
20383 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
20384 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
20385 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
20386 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
20387 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
20388 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
20389 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
20390 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
20391 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
20392 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
20393 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
20394 ; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
20395 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20396 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20397 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
20398 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
20399 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
20400 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
20401 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
20402 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
20403 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
20404 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
20405 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
20406 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
20407 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
20408 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
20409 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
20410 ; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
20411 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20412 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20413 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
20414 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
20415 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
20416 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
20417 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
20418 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
20419 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
20420 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
20421 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
20422 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
20423 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
20424 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
20425 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
20426 ; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
20427 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20428 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20429 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
20430 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
20431 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
20432 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
20433 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
20434 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
20435 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
20436 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
20437 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
20438 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
20439 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
20440 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
20441 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
20442 ; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
20443 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20444 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20445 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
20446 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
20447 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
20448 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
20449 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
20450 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
20451 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
20452 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
20453 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
20454 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
20455 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
20456 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
20457 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
20458 ; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
20459 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20460 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20461 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
20462 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
20463 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
20464 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
20465 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
20466 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
20467 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
20468 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
20469 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
20470 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
20471 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
20472 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
20473 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
20474 ; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
20475 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20476 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20477 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
20478 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
20479 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
20480 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
20481 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
20482 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
20483 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
20484 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
20485 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
20486 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
20487 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
20488 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
20489 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
20490 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
20491 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
20492 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
20493 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
20494 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
20495 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
20496 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
20497 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
20498 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
20499 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
20500 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
20501 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
20502 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
20503 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
20504 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
20505 ; GFX9-NEXT: s_setpc_b64 s[30:31]
20507 ; GFX10-LABEL: v_minnum_v32bf16:
20509 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20510 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
20511 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
20512 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
20513 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
20514 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
20515 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
20516 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
20517 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20518 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20519 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
20520 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
20521 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
20522 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
20523 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
20524 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20525 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
20526 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
20527 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20528 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20529 ; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
20530 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
20531 ; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
20532 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
20533 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20534 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20535 ; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
20536 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
20537 ; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
20538 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
20539 ; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
20540 ; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
20541 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
20542 ; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
20543 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
20544 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20545 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20546 ; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
20547 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
20548 ; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
20549 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
20550 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20551 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20552 ; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
20553 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
20554 ; GFX10-NEXT: v_min_f32_e32 v27, v50, v27
20555 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
20556 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20557 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20558 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20559 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20560 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
20561 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
20562 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20563 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20564 ; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
20565 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
20566 ; GFX10-NEXT: v_min_f32_e32 v29, v38, v29
20567 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
20568 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20569 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20570 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
20571 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
20572 ; GFX10-NEXT: v_min_f32_e32 v28, v48, v28
20573 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
20574 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20575 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20576 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
20577 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
20578 ; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
20579 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
20580 ; GFX10-NEXT: v_min_f32_e32 v34, v34, v51
20581 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
20582 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20583 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20584 ; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
20585 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
20586 ; GFX10-NEXT: v_min_f32_e32 v30, v36, v30
20587 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
20588 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20589 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20590 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
20591 ; GFX10-NEXT: v_min_f32_e32 v18, v48, v23
20592 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
20593 ; GFX10-NEXT: v_min_f32_e32 v17, v50, v22
20594 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
20595 ; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
20596 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
20597 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
20598 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20599 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20600 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
20601 ; GFX10-NEXT: v_min_f32_e32 v20, v36, v25
20602 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
20603 ; GFX10-NEXT: v_min_f32_e32 v19, v38, v24
20604 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
20605 ; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
20606 ; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
20607 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
20608 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
20609 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
20610 ; GFX10-NEXT: v_min_f32_e32 v21, v51, v26
20611 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
20612 ; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
20613 ; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
20614 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
20615 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
20616 ; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
20617 ; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
20618 ; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
20619 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
20620 ; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
20621 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
20622 ; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
20623 ; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
20624 ; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
20625 ; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
20626 ; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
20627 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
20628 ; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
20629 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
20630 ; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
20631 ; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
20632 ; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
20633 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
20634 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
20635 ; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
20636 ; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
20637 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
20638 ; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
20639 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
20640 ; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
20641 ; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
20642 ; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
20643 ; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
20644 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
20645 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
20646 ; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
20647 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
20648 ; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
20649 ; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
20650 ; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
20651 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
20652 ; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
20653 ; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
20654 ; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
20655 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
20656 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
20657 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
20658 ; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
20659 ; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
20660 ; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
20661 ; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
20662 ; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
20663 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
20664 ; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
20665 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
20666 ; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
20667 ; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
20668 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
20669 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
20670 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
20671 ; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
20672 ; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
20673 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
20674 ; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
20675 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
20676 ; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
20677 ; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
20678 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
20679 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
20680 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
20681 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
20682 ; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
20683 ; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
20684 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
20685 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
20686 ; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
20687 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
20688 ; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
20689 ; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
20690 ; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
20691 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
20692 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
20693 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
20694 ; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
20695 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
20696 ; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
20697 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
20698 ; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
20699 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
20700 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
20701 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
20702 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
20703 ; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
20704 ; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
20705 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
20706 ; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
20707 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
20708 ; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
20709 ; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
20710 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
20711 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
20712 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
20713 ; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
20714 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
20715 ; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
20716 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
20717 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
20718 ; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
20719 ; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
20720 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
20721 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
20722 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
20723 ; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
20724 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
20725 ; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
20726 ; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
20727 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
20728 ; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
20729 ; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
20730 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
20731 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
20732 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
20733 ; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
20734 ; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
20735 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
20736 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
20737 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
20738 ; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
20739 ; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
20740 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
20741 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
20742 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
20743 ; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
20744 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
20745 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
20746 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
20747 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
20748 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
20749 ; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
20750 ; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
20751 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
20752 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
20753 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
20754 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
20755 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
20756 ; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
20757 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
20758 ; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
20759 ; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
20760 ; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
20761 ; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
20762 ; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
20763 ; GFX10-NEXT: s_waitcnt vmcnt(0)
20764 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
20765 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
20766 ; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
20767 ; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
20768 ; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
20769 ; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
20770 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
20771 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
20772 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
20773 ; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
20774 ; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
20775 ; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
20776 ; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
20777 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
20778 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
20779 ; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
20780 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
20781 ; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
20782 ; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
20783 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
20784 ; GFX10-NEXT: s_setpc_b64 s[30:31]
20786 ; GFX11-LABEL: v_minnum_v32bf16:
20788 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20789 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
20790 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
20791 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
20792 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20793 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20794 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
20795 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
20796 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20797 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20798 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
20799 ; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
20800 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
20801 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20802 ; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
20803 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
20804 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
20805 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20806 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
20807 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
20808 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
20809 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
20810 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
20811 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
20812 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20813 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
20814 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
20815 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
20816 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20817 ; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
20818 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
20819 ; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
20820 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20821 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
20822 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
20823 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20824 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
20825 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
20826 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
20827 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
20828 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
20829 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
20830 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
20831 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20832 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20833 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20834 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
20835 ; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
20836 ; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
20837 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
20838 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
20839 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20840 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
20841 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
20842 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20843 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20844 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
20845 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
20846 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20847 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
20848 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20849 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
20850 ; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
20851 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v16
20852 ; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
20853 ; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
20854 ; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
20855 ; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
20856 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
20857 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
20858 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
20859 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
20860 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
20861 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
20862 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
20863 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
20864 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20865 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
20866 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20867 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20868 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
20869 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
20870 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20871 ; GFX11-NEXT: v_min_f32_e32 v4, v4, v20
20872 ; GFX11-NEXT: v_min_f32_e32 v20, v80, v71
20873 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
20874 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
20875 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
20876 ; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
20877 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
20878 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
20879 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20880 ; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
20881 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20882 ; GFX11-NEXT: v_min_f32_e32 v26, v52, v51
20883 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20884 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
20885 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
20886 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20887 ; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
20888 ; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
20889 ; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
20890 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20891 ; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
20892 ; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
20893 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20894 ; GFX11-NEXT: v_min_f32_e32 v29, v38, v37
20895 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
20896 ; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
20897 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
20898 ; GFX11-NEXT: v_min_f32_e32 v14, v14, v30
20899 ; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
20900 ; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
20901 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
20902 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
20903 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
20904 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20905 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
20906 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
20907 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
20908 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
20909 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
20910 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
20911 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
20912 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
20913 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
20914 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
20915 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
20916 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
20917 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
20918 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
20919 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
20920 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
20921 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
20922 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
20923 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
20924 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
20925 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
20926 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
20927 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
20928 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
20929 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
20930 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
20931 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
20932 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
20933 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
20934 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
20935 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
20936 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
20937 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
20938 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
20939 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
20940 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
20941 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
20942 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
20943 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
20944 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
20945 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
20946 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
20947 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
20948 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
20949 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
20950 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
20951 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
20952 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
20953 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
20954 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
20955 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
20956 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
20957 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
20958 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
20959 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
20960 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
20961 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
20962 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
20963 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
20964 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
20965 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
20966 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
20967 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
20968 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
20969 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
20970 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
20971 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
20972 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
20973 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
20974 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
20975 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
20976 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
20977 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
20978 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
20979 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
20980 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
20981 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
20982 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
20983 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
20984 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
20985 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
20986 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
20987 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
20988 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
20989 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
20990 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
20991 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
20992 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
20993 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
20994 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
20995 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
20996 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
20997 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
20998 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
20999 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
21000 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
21001 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
21002 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
21003 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
21004 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
21005 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21006 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
21007 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
21008 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
21009 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
21010 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
21011 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
21012 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
21013 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
21014 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
21015 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
21016 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21017 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
21018 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
21019 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21020 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
21021 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
21022 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
21023 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
21024 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
21025 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
21026 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
21027 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
21028 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21029 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
21030 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
21031 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21032 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
21033 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
21034 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21035 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
21036 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21037 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
21038 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
21039 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21040 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
21041 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
21042 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
21043 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
21044 ; GFX11-NEXT: s_waitcnt vmcnt(0)
21045 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
21046 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21047 ; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
21048 ; GFX11-NEXT: v_min_f32_e32 v15, v15, v18
21049 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21050 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
21051 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
21052 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
21053 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
21054 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
21055 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
21056 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
21057 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
21058 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
21059 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
21060 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
21061 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21062 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
21063 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21064 %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
21065 ret <32 x bfloat> %op
; Declarations of the llvm.maxnum intrinsic for scalar bfloat and for bfloat
; vectors of 2, 3, 4, 8, 16 and 32 elements. Each takes two operands of the
; same type and returns that type.
21069 declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
21070 declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
21071 declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
21072 declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
21073 declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
21074 declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
21075 declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; Codegen test for the scalar bfloat form of llvm.maxnum. The IR body calls
; @llvm.maxnum.bf16 on %a and %b; the check lines pin the instructions llc is
; expected to emit for each prefix selected in the file header.
;
; What the expected output shows:
;  - GCN and GFX7 expectations multiply both inputs by 1.0 (v_mul_f32), mask
;    them with 0xffff0000, apply v_max_f32, and mask the result again.
;  - GFX8 through GFX11 expectations shift both inputs left by 16, apply
;    v_max_f32, then add bit 16 of the result plus 0x7fff to the result.
;    v_cmp_u of the result with itself instead selects the result OR'ed with
;    0x400000, and the selected value is shifted right by 16.
;    NOTE(review): this looks like f32-to-bf16 round-to-nearest-even with NaN
;    quieting -- confirm against the AMDGPU lowering.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
21077 define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
21078 ; GCN-LABEL: v_maxnum_bf16:
21080 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21081 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21082 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21083 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21084 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21085 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1
21086 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21087 ; GCN-NEXT: s_setpc_b64 s[30:31]
21089 ; GFX7-LABEL: v_maxnum_bf16:
21091 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21092 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21093 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21094 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21095 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21096 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
21097 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21098 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21100 ; GFX8-LABEL: v_maxnum_bf16:
21102 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21103 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21104 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21105 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
21106 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
21107 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
21108 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
21109 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
21110 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21111 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
21112 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21113 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21115 ; GFX9-LABEL: v_maxnum_bf16:
21117 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21118 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21119 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21120 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
21121 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
21122 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21123 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
21124 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
21125 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21126 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
21127 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21128 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21130 ; GFX10-LABEL: v_maxnum_bf16:
21132 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21133 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21134 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21135 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
21136 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
21137 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
21138 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21139 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
21140 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21141 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21142 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21144 ; GFX11-LABEL: v_maxnum_bf16:
21146 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21147 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21148 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21149 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21150 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
21151 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
21152 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
21153 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21154 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
21155 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
21156 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21157 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21158 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21159 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21160 %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
; Codegen test for llvm.maxnum on <2 x bfloat>. The IR body calls
; @llvm.maxnum.v2bf16 on %a and %b and returns the result; the check lines pin
; the instructions llc is expected to emit for each prefix selected in the
; file header.
;
; What the expected output shows:
;  - GCN and GFX7 expectations take four input registers (v0..v3), multiply
;    each by 1.0, mask each with 0xffff0000, apply v_max_f32 to the pairs
;    (v1, v3) and (v0, v2), and mask both results again.
;  - GFX8 through GFX11 expectations take two input registers (v0, v1). One
;    v_max_f32 runs on the inputs shifted left by 16 and a second on the
;    inputs masked with 0xffff0000. Each result then goes through the same
;    sequence of adding bit 16 plus 0x7fff, or selecting the result OR'ed with
;    0x400000 when v_cmp_u reports it unordered with itself.
;    NOTE(review): this looks like f32-to-bf16 round-to-nearest-even with NaN
;    quieting applied per element -- confirm against the AMDGPU lowering.
;  - The two values are combined into v0 with v_alignbit_b32 on GFX8 (after a
;    right shift by 16 of one of them) and with v_perm_b32 using selector
;    0x7060302 on GFX9 through GFX11.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
21164 define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
21165 ; GCN-LABEL: v_maxnum_v2bf16:
21167 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21168 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21169 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21170 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21171 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21172 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21173 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21174 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21175 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21176 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3
21177 ; GCN-NEXT: v_max_f32_e32 v0, v0, v2
21178 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21179 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21180 ; GCN-NEXT: s_setpc_b64 s[30:31]
21182 ; GFX7-LABEL: v_maxnum_v2bf16:
21184 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21185 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21186 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21187 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21188 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21189 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21190 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21191 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21192 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21193 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
21194 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
21195 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21196 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21197 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21199 ; GFX8-LABEL: v_maxnum_v2bf16:
21201 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21202 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21203 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21204 ; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
21205 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
21206 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
21207 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21208 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21209 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
21210 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
21211 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
21212 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21213 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
21214 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
21215 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
21216 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
21217 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
21218 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21219 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
21220 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21221 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
21222 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21224 ; GFX9-LABEL: v_maxnum_v2bf16:
21226 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21227 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21228 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21229 ; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
21230 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21231 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21232 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
21233 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21234 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
21235 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
21236 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
21237 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21238 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
21239 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
21240 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
21241 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
21242 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21243 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
21244 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21245 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
21246 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21248 ; GFX10-LABEL: v_maxnum_v2bf16:
21250 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21251 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21252 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21253 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21254 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21255 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
21256 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
21257 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
21258 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
21259 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
21260 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21261 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
21262 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
21263 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
21264 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21265 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21266 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21267 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
21268 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21270 ; GFX11-LABEL: v_maxnum_v2bf16:
21272 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21273 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21274 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21275 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21276 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21277 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
21278 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
21279 ; GFX11-NEXT: v_max_f32_e32 v2, v3, v2
21280 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21281 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
21282 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
21283 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
21284 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21285 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
21286 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
21287 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
21288 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
21289 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21290 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21291 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21292 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21293 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
21294 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21295 %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
21296 ret <2 x bfloat> %op
; Codegen test for llvm.maxnum on <3 x bfloat>. The IR body calls
; @llvm.maxnum.v3bf16 on %a and %b and returns the result; the check lines pin
; the instructions llc is expected to emit for each prefix selected in the
; file header.
;
; What the expected output shows:
;  - GCN and GFX7 expectations take six input registers (v0..v5), multiply
;    each by 1.0, mask each with 0xffff0000, apply v_max_f32 to the pairs
;    (v2, v5), (v1, v4) and (v0, v3), and mask the three results again.
;  - GFX8 through GFX11 expectations take four input registers (v0..v3) and
;    perform three v_max_f32 operations. Each result goes through the sequence
;    of adding bit 16 plus 0x7fff, or selecting the result OR'ed with 0x400000
;    when v_cmp_u reports it unordered with itself.
;    NOTE(review): this looks like f32-to-bf16 round-to-nearest-even with NaN
;    quieting applied per element -- confirm against the AMDGPU lowering.
;  - v0 is assembled with v_alignbit_b32 on GFX8 and with v_perm_b32 using
;    selector 0x7060302 on GFX9 through GFX11. v1 is produced by a right shift
;    by 16 on GFX8 and by v_alignbit_b32 elsewhere.
;  - The first source operand of that final v_alignbit_b32 is s4 for GFX9 and
;    GFX10, v0 for GFX11TRUE16 and s0 for GFX11FAKE16. In the visible check
;    lines this operand is the only difference between the two GFX11 variants.
;    NOTE(review): presumably it only fills the unused upper half of v1, so
;    its value does not matter -- confirm before relying on it.
;
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
21299 define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
21300 ; GCN-LABEL: v_maxnum_v3bf16:
21302 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21303 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21304 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21305 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21306 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21307 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21308 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21309 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21310 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21311 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21312 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21313 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21314 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21315 ; GCN-NEXT: v_max_f32_e32 v2, v2, v5
21316 ; GCN-NEXT: v_max_f32_e32 v1, v1, v4
21317 ; GCN-NEXT: v_max_f32_e32 v0, v0, v3
21318 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21319 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21320 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21321 ; GCN-NEXT: s_setpc_b64 s[30:31]
21323 ; GFX7-LABEL: v_maxnum_v3bf16:
21325 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21326 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21327 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21328 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21329 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21330 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21331 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21332 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21333 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21334 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21335 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21336 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21337 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21338 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
21339 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
21340 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
21341 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21342 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21343 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21344 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21346 ; GFX8-LABEL: v_maxnum_v3bf16:
21348 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21349 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21350 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21351 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
21352 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
21353 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
21354 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
21355 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
21356 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21357 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
21358 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21359 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
21360 ; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
21361 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
21362 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21363 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
21364 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21365 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21366 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
21367 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
21368 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
21369 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21370 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
21371 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
21372 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
21373 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
21374 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
21375 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21376 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
21377 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21378 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21379 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
21380 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21382 ; GFX9-LABEL: v_maxnum_v3bf16:
21384 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21385 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21386 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21387 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
21388 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
21389 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21390 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
21391 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
21392 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21393 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
21394 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21395 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
21396 ; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
21397 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21398 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21399 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
21400 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
21401 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
21402 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
21403 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21404 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
21405 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
21406 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
21407 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
21408 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21409 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
21410 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21411 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
21412 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
21413 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21415 ; GFX10-LABEL: v_maxnum_v3bf16:
21417 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21418 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21419 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21420 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21421 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21422 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21423 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21424 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
21425 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
21426 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
21427 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
21428 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
21429 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
21430 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21431 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
21432 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21433 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
21434 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21435 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
21436 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21437 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21438 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21439 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21440 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21441 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21442 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21443 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
21444 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21446 ; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
21447 ; GFX11TRUE16: ; %bb.0:
21448 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21449 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21450 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21451 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21452 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21453 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21454 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21455 ; GFX11TRUE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21456 ; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21457 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21458 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
21459 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
21460 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21461 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
21462 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
21463 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21464 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
21465 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
21466 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21467 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21468 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21469 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21470 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21471 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21472 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21473 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21474 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21475 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
21476 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
21477 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
21479 ; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
21480 ; GFX11FAKE16: ; %bb.0:
21481 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21482 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21483 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21484 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21485 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21486 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21487 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21488 ; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21489 ; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21490 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21491 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
21492 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
21493 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21494 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
21495 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
21496 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21497 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
21498 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
21499 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21500 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21501 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21502 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21503 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21504 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21505 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21506 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21507 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21508 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
21509 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
21510 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
21511 %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
21512 ret <3 x bfloat> %op
21515 define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
21516 ; GCN-LABEL: v_maxnum_v4bf16:
21518 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21519 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21520 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21521 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21522 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21523 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21524 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
21525 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21526 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
21527 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21528 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21529 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21530 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21531 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21532 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21533 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21534 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21535 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7
21536 ; GCN-NEXT: v_max_f32_e32 v2, v2, v6
21537 ; GCN-NEXT: v_max_f32_e32 v1, v1, v5
21538 ; GCN-NEXT: v_max_f32_e32 v0, v0, v4
21539 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21540 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21541 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21542 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21543 ; GCN-NEXT: s_setpc_b64 s[30:31]
21545 ; GFX7-LABEL: v_maxnum_v4bf16:
21547 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21548 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21549 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21550 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21551 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21552 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21553 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
21554 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21555 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
21556 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21557 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21558 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21559 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21560 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21561 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21562 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21563 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21564 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
21565 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
21566 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
21567 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
21568 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21569 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21570 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21571 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21572 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21574 ; GFX8-LABEL: v_maxnum_v4bf16:
21576 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21577 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21578 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21579 ; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
21580 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
21581 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
21582 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21583 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21584 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
21585 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
21586 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
21587 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
21588 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
21589 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21590 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
21591 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
21592 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
21593 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
21594 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21595 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
21596 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21597 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21598 ; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
21599 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
21600 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
21601 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21602 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21603 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
21604 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
21605 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
21606 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21607 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
21608 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
21609 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
21610 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
21611 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
21612 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21613 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
21614 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21615 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21616 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
21617 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
21618 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21620 ; GFX9-LABEL: v_maxnum_v4bf16:
21622 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21623 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21624 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21625 ; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
21626 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21627 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21628 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
21629 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21630 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
21631 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
21632 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
21633 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
21634 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
21635 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
21636 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
21637 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
21638 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21639 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
21640 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21641 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21642 ; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
21643 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21644 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21645 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
21646 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
21647 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
21648 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
21649 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21650 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
21651 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
21652 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
21653 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
21654 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21655 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
21656 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21657 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
21658 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
21659 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21661 ; GFX10-LABEL: v_maxnum_v4bf16:
21663 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21664 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21665 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21666 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21667 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21668 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
21669 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
21670 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
21671 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21672 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21673 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
21674 ; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
21675 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
21676 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
21677 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
21678 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21679 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
21680 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
21681 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
21682 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
21683 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
21684 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
21685 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21686 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
21687 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21688 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
21689 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
21690 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
21691 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21692 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21693 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21694 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21695 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
21696 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21697 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
21698 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21700 ; GFX11-LABEL: v_maxnum_v4bf16:
21702 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21703 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
21704 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
21705 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21706 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21707 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21708 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21709 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21710 ; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
21711 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21712 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
21713 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21714 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
21715 ; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
21716 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
21717 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
21718 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
21719 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
21720 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
21721 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21722 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
21723 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
21724 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
21725 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
21726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
21727 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21728 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
21729 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21730 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
21731 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
21732 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21733 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21734 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21735 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21736 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21737 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
21738 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21739 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21740 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
21741 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21742 %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
21743 ret <4 x bfloat> %op
; v_maxnum_v8bf16: calls llvm.maxnum.v8bf16 on two <8 x bfloat> arguments and
; returns the result unchanged. Each check-prefix group below pins the llc
; output for one of the llc invocations listed in the file header.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
; Observed in the expected output: the GCN and GFX7 groups operate on sixteen
; separate registers (v0-v15), while the GFX8 and newer groups operate on
; v0-v7, splitting each register with a 16-bit left shift and a 0xffff0000
; mask before v_max_f32. Presumably this is promotion of each bf16 element
; to f32 followed by rounding back to bf16 - TODO confirm against the AMDGPU
; lowering code.
21746 define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
21747 ; GCN-LABEL: v_maxnum_v8bf16:
21749 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21750 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21751 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
21752 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21753 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
21754 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21755 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
21756 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21757 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
21758 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21759 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
21760 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21761 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
21762 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
21763 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
21764 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
21765 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
21766 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
21767 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21768 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
21769 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21770 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
21771 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21772 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
21773 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21774 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
21775 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21776 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
21777 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21778 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
21779 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21780 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
21781 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21782 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15
21783 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14
21784 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13
21785 ; GCN-NEXT: v_max_f32_e32 v4, v4, v12
21786 ; GCN-NEXT: v_max_f32_e32 v3, v3, v11
21787 ; GCN-NEXT: v_max_f32_e32 v2, v2, v10
21788 ; GCN-NEXT: v_max_f32_e32 v1, v1, v9
21789 ; GCN-NEXT: v_max_f32_e32 v0, v0, v8
21790 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21791 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21792 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21793 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21794 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21795 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21796 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21797 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21798 ; GCN-NEXT: s_setpc_b64 s[30:31]
21800 ; GFX7-LABEL: v_maxnum_v8bf16:
21802 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21803 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21804 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
21805 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21806 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
21807 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21808 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
21809 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21810 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
21811 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21812 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
21813 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21814 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
21815 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
21816 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
21817 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
21818 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
21819 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
21820 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21821 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
21822 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21823 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
21824 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21825 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
21826 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21827 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
21828 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21829 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
21830 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21831 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
21832 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21833 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
21834 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21835 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
21836 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
21837 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
21838 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
21839 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
21840 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
21841 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
21842 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
21843 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21844 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21845 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21846 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21847 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21848 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21849 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21850 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21851 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21853 ; GFX8-LABEL: v_maxnum_v8bf16:
21855 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21856 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
21857 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
21858 ; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
21859 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
21860 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
21861 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21862 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21863 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
21864 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
21865 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
21866 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
21867 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
21868 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21869 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
21870 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
21871 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
21872 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
21873 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21874 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
21875 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
21876 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
21877 ; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
21878 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
21879 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
21880 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21881 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21882 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21883 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
21884 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
21885 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
21886 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
21887 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
21888 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
21889 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
21890 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
21891 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21892 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
21893 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
21894 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
21895 ; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
21896 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
21897 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
21898 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21899 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21900 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21901 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
21902 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
21903 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
21904 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
21905 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
21906 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
21907 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
21908 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
21909 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21910 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
21911 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
21912 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
21913 ; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
21914 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
21915 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
21916 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21917 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21918 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21919 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
21920 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
21921 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
21922 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
21923 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
21924 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
21925 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
21926 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
21927 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21928 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
21929 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
21930 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
21931 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21932 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21933 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
21934 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
21935 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
21936 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
21937 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21939 ; GFX9-LABEL: v_maxnum_v8bf16:
21941 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21942 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
21943 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
21944 ; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
21945 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21946 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21947 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
21948 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21949 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
21950 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
21951 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
21952 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
21953 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
21954 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
21955 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
21956 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
21957 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21958 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
21959 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
21960 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
21961 ; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
21962 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21963 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21964 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
21965 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
21966 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
21967 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
21968 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
21969 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
21970 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
21971 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
21972 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
21973 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21974 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
21975 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
21976 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
21977 ; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
21978 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21979 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21980 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
21981 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
21982 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
21983 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
21984 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
21985 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
21986 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
21987 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
21988 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
21989 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21990 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
21991 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
21992 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
21993 ; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
21994 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21995 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21996 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
21997 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
21998 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
21999 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
22000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22001 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
22002 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
22003 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
22004 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
22005 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22006 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
22007 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
22008 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
22009 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
22010 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
22011 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
22012 ; GFX9-NEXT: s_setpc_b64 s[30:31]
22014 ; GFX10-LABEL: v_maxnum_v8bf16:
22016 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22017 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
22018 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
22019 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22020 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22021 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
22022 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22023 ; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
22024 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
22025 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22026 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
22027 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
22028 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
22029 ; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
22030 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
22031 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
22032 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
22033 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
22034 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
22035 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
22036 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
22037 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
22038 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22039 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
22040 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
22041 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
22042 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
22043 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22044 ; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
22045 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
22046 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22047 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22048 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
22049 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
22050 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
22051 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
22052 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22053 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22054 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22055 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
22056 ; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
22057 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
22058 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
22059 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22060 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
22061 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
22062 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
22063 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
22064 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22065 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
22066 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
22067 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
22068 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
22069 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
22070 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
22071 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22072 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
22073 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
22074 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
22075 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22076 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22077 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22078 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22079 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
22080 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22081 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22082 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
22083 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22084 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
22085 ; GFX10-NEXT: s_setpc_b64 s[30:31]
22087 ; GFX11-LABEL: v_maxnum_v8bf16:
22089 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22090 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
22091 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
22092 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22093 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
22094 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
22095 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22096 ; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
22097 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
22098 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
22099 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22100 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
22101 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22102 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
22103 ; GFX11-NEXT: v_max_f32_e32 v7, v10, v9
22104 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
22105 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
22106 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22107 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
22108 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
22109 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
22110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
22111 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22112 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22113 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
22114 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
22115 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
22116 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22117 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
22118 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22119 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22120 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
22121 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
22122 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22123 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
22124 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22125 ; GFX11-NEXT: v_max_f32_e32 v6, v10, v6
22126 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
22127 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22128 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
22129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22130 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
22131 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22132 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
22133 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22134 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
22135 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22136 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22137 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
22138 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22139 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
22140 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
22141 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
22142 ; GFX11-NEXT: v_max_f32_e32 v5, v15, v13
22143 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22144 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
22145 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
22146 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
22147 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
22148 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22149 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
22150 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
22151 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22152 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
22153 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
22154 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
22155 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
22156 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22157 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22158 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22159 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22161 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
22162 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22163 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22164 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
22165 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22166 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22167 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
22168 ; GFX11-NEXT: s_setpc_b64 s[30:31]
22169 %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
22170 ret <8 x bfloat> %op
22173 define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
22174 ; GCN-LABEL: v_maxnum_v16bf16:
22176 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22177 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
22178 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
22179 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
22180 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22181 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30
22182 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
22183 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
22184 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
22185 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22186 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29
22187 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
22188 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
22189 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
22190 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22191 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28
22192 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
22193 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
22194 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
22195 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
22196 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
22197 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
22198 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
22199 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
22200 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
22201 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
22202 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
22203 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
22204 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
22205 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
22206 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
22207 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
22208 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
22209 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
22210 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
22211 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
22212 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
22213 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
22214 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
22215 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
22216 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
22217 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
22218 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22219 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27
22220 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
22221 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
22222 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22223 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
22224 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22225 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
22226 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22227 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
22228 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22229 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22230 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22231 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
22232 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22233 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
22234 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22235 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
22236 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22237 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
22238 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22239 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
22240 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22241 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22242 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22243 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22244 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26
22245 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25
22246 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24
22247 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23
22248 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22
22249 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21
22250 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20
22251 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19
22252 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18
22253 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17
22254 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16
22255 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22256 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22257 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22258 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22259 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22260 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22261 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22262 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22263 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22264 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22265 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22266 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22267 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22268 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22269 ; GCN-NEXT: s_waitcnt vmcnt(0)
22270 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
22271 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22272 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16
22273 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22274 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22275 ; GCN-NEXT: s_setpc_b64 s[30:31]
22277 ; GFX7-LABEL: v_maxnum_v16bf16:
22279 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22280 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
22281 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
22282 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
22283 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22284 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
22285 ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
22286 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
22287 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
22288 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22289 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22290 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
22291 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
22292 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
22293 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
22294 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
22295 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
22296 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
22297 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
22298 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
22299 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
22300 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
22301 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
22302 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
22303 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
22304 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
22305 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
22306 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
22307 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
22308 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
22309 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
22310 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
22311 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
22312 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
22313 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
22314 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
22315 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
22316 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
22317 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
22318 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
22319 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22320 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
22321 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22322 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
22323 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22324 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
22325 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22326 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
22327 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22328 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
22329 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22330 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
22331 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22332 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22333 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
22334 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22335 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
22336 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22337 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
22338 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22339 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
22340 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22341 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
22342 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22343 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22344 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22345 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
22346 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
22347 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
22348 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
22349 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
22350 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
22351 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
22352 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
22353 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
22354 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
22355 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
22356 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
22357 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
22358 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22359 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22360 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22361 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22362 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22363 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22364 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22365 ; GFX7-NEXT: s_waitcnt vmcnt(0)
22366 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
22367 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22368 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
22369 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22370 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22371 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22372 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22373 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22374 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22375 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22376 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22377 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22378 ; GFX7-NEXT: s_setpc_b64 s[30:31]
22380 ; GFX8-LABEL: v_maxnum_v16bf16:
22382 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22383 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22384 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22385 ; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
22386 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
22387 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
22388 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
22389 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22390 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22391 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22392 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
22393 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
22394 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
22395 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
22396 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
22397 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
22398 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
22399 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
22400 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
22401 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
22402 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
22403 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
22404 ; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
22405 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
22406 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
22407 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22408 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22409 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22410 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
22411 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
22412 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
22413 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
22414 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
22415 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
22416 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
22417 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
22418 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
22419 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
22420 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
22421 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
22422 ; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
22423 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
22424 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
22425 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22426 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22427 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22428 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
22429 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
22430 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
22431 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
22432 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
22433 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
22434 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
22435 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
22436 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22437 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
22438 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
22439 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
22440 ; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
22441 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
22442 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
22443 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22444 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22445 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22446 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
22447 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
22448 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
22449 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
22450 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
22451 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
22452 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
22453 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
22454 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
22455 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
22456 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22457 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
22458 ; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
22459 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
22460 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
22461 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22462 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22463 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22464 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
22465 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
22466 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
22467 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
22468 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
22469 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
22470 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
22471 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
22472 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
22473 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
22474 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
22475 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
22476 ; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
22477 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
22478 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
22479 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22480 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22481 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22482 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
22483 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
22484 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
22485 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
22486 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
22487 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
22488 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
22489 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
22490 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
22491 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
22492 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
22493 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
22494 ; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
22495 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
22496 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
22497 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22498 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22499 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22500 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
22501 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
22502 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
22503 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
22504 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
22505 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
22506 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
22507 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
22508 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
22509 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
22510 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
22511 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
22512 ; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
22513 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
22514 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
22515 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22516 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22517 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22518 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
22519 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
22520 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
22521 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
22522 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
22523 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
22524 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
22525 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
22526 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22527 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
22528 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
22529 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
22530 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
22531 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
22532 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
22533 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
22534 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
22535 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
22536 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
22537 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
22538 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
22539 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
22540 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
22541 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
22542 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
22543 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
22544 ; GFX8-NEXT: s_setpc_b64 s[30:31]
22546 ; GFX9-LABEL: v_maxnum_v16bf16:
22548 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22549 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22550 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22551 ; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
22552 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22553 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22554 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
22555 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
22556 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
22557 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
22558 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
22559 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
22560 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
22561 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
22562 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
22563 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
22564 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
22565 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
22566 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
22567 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
22568 ; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
22569 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22570 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22571 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
22572 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
22573 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
22574 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
22575 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
22576 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
22577 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
22578 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
22579 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
22580 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
22581 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
22582 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
22583 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
22584 ; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
22585 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22586 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22587 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
22588 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
22589 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
22590 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
22591 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
22592 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
22593 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
22594 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
22595 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
22596 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22597 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
22598 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
22599 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
22600 ; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
22601 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22602 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22603 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
22604 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
22605 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
22606 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
22607 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
22608 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
22609 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
22610 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
22611 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
22612 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
22613 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
22614 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22615 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
22616 ; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
22617 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22618 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22619 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
22620 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
22621 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
22622 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
22623 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
22624 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
22625 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
22626 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
22627 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
22628 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
22629 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
22630 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
22631 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
22632 ; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
22633 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22634 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22635 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
22636 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
22637 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
22638 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
22639 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
22640 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
22641 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
22642 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
22643 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
22644 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
22645 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
22646 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
22647 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
22648 ; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
22649 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22650 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22651 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
22652 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
22653 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
22654 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
22655 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
22656 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
22657 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
22658 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
22659 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
22660 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
22661 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
22662 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
22663 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
22664 ; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
22665 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22666 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22667 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
22668 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
22669 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
22670 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
22671 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
22672 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
22673 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
22674 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
22675 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
22676 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22677 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
22678 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
22679 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
22680 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
22681 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
22682 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
22683 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
22684 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
22685 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
22686 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
22687 ; GFX9-NEXT: s_setpc_b64 s[30:31]
22689 ; GFX10-LABEL: v_maxnum_v16bf16:
22691 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22692 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22693 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22694 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22695 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22696 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
22697 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22698 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
22699 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
22700 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
22701 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22702 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
22703 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
22704 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
22705 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
22706 ; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
22707 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
22708 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
22709 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
22710 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
22711 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
22712 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
22713 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22714 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
22715 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
22716 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
22717 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22718 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22719 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
22720 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
22721 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22722 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22723 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
22724 ; GFX10-NEXT: v_max_f32_e32 v17, v20, v19
22725 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
22726 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
22727 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
22728 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
22729 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
22730 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
22731 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
22732 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22733 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
22734 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22735 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22736 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
22737 ; GFX10-NEXT: v_max_f32_e32 v13, v19, v18
22738 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
22739 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
22740 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22741 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
22742 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
22743 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
22744 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
22745 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22746 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22747 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22748 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
22749 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
22750 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22751 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22752 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
22753 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22754 ; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
22755 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
22756 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
22757 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
22758 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
22759 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
22760 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22761 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
22762 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
22763 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
22764 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
22765 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22766 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
22767 ; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
22768 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22769 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
22770 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
22771 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
22772 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
22773 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
22774 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22775 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22776 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
22777 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
22778 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
22779 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22780 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22781 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
22782 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
22783 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
22784 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22785 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
22786 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
22787 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
22788 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22789 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
22790 ; GFX10-NEXT: v_max_f32_e32 v19, v22, v20
22791 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
22792 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
22793 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22794 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22795 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
22796 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
22797 ; GFX10-NEXT: v_max_f32_e32 v9, v22, v20
22798 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
22799 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
22800 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
22801 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
22802 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
22803 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
22804 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
22805 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
22806 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
22807 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22808 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
22809 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22810 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
22811 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
22812 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
22813 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
22814 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22815 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
22816 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
22817 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
22818 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22819 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22820 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22821 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22822 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
22823 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22824 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
22825 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
22826 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22827 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
22828 ; GFX10-NEXT: s_setpc_b64 s[30:31]
22830 ; GFX11-LABEL: v_maxnum_v16bf16:
22832 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22833 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
22834 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22835 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22836 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22838 ; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
22839 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
22840 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22841 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
22842 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
22843 ; GFX11-NEXT: v_max_f32_e32 v17, v18, v17
22844 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v14
22845 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
22846 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
22847 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
22848 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
22849 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22850 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22851 ; GFX11-NEXT: v_max_f32_e32 v7, v7, v15
22852 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
22853 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
22854 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
22855 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22856 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
22857 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
22858 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22859 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
22860 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
22861 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22862 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22863 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
22864 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
22865 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22866 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
22867 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
22868 ; GFX11-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
22869 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
22870 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
22871 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
22872 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22873 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22874 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22875 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
22876 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22877 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
22878 ; GFX11-NEXT: v_max_f32_e32 v4, v4, v12
22879 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22880 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22881 ; GFX11-NEXT: v_max_f32_e32 v5, v5, v13
22882 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
22883 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22884 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
22885 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
22886 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
22887 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22888 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
22889 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
22890 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22891 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
22892 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
22893 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
22894 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22895 ; GFX11-NEXT: v_max_f32_e32 v12, v18, v12
22896 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22897 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
22898 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
22899 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
22900 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
22901 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22902 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22903 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
22904 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
22905 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
22906 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
22907 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
22908 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
22909 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22910 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
22911 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
22912 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
22913 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22914 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22915 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
22916 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22917 ; GFX11-NEXT: v_max_f32_e32 v18, v19, v18
22918 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22919 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
22920 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22921 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22922 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
22923 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
22924 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22925 ; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
22926 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v11
22927 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
22928 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
22929 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22930 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
22931 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22932 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
22933 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
22934 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22935 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22936 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
22937 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
22938 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
22939 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
22940 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
22941 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22942 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
22943 ; GFX11-NEXT: v_max_f32_e32 v19, v22, v20
22944 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
22945 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
22946 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22947 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
22948 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
22949 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22950 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22951 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
22952 ; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9
22953 ; GFX11-NEXT: v_max_f32_e32 v9, v22, v20
22954 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22955 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
22956 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
22957 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
22958 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
22959 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
22960 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
22961 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22962 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
22963 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
22964 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22965 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
22966 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
22967 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22968 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22969 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
22970 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
22971 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22972 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
22973 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22974 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22975 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22976 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22977 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22978 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
22979 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22980 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
22981 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
22982 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22984 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
22985 ; GFX11-NEXT: s_setpc_b64 s[30:31]
22986 %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
22987 ret <16 x bfloat> %op
22990 define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
22991 ; GCN-LABEL: v_maxnum_v32bf16:
22993 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22994 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
22995 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
22996 ; GCN-NEXT: s_waitcnt vmcnt(1)
22997 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
22998 ; GCN-NEXT: s_waitcnt vmcnt(0)
22999 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
23000 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23001 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23002 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
23003 ; GCN-NEXT: v_max_f32_e32 v31, v31, v32
23004 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
23005 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23006 ; GCN-NEXT: s_waitcnt vmcnt(0)
23007 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23008 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23009 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
23010 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32
23011 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
23012 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23013 ; GCN-NEXT: s_waitcnt vmcnt(0)
23014 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23015 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23016 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
23017 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32
23018 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
23019 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23020 ; GCN-NEXT: s_waitcnt vmcnt(0)
23021 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23022 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23023 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
23024 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32
23025 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
23026 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23027 ; GCN-NEXT: s_waitcnt vmcnt(0)
23028 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23029 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23030 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
23031 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32
23032 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
23033 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23034 ; GCN-NEXT: s_waitcnt vmcnt(0)
23035 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23036 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23037 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
23038 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32
23039 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
23040 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23041 ; GCN-NEXT: s_waitcnt vmcnt(0)
23042 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23043 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23044 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
23045 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32
23046 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
23047 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23048 ; GCN-NEXT: s_waitcnt vmcnt(0)
23049 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23050 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23051 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
23052 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32
23053 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
23054 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23055 ; GCN-NEXT: s_waitcnt vmcnt(0)
23056 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23057 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23058 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
23059 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32
23060 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
23061 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23062 ; GCN-NEXT: s_waitcnt vmcnt(0)
23063 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23064 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23065 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
23066 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32
23067 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
23068 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23069 ; GCN-NEXT: s_waitcnt vmcnt(0)
23070 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23071 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23072 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
23073 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32
23074 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
23075 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23076 ; GCN-NEXT: s_waitcnt vmcnt(0)
23077 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23078 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23079 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
23080 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32
23081 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
23082 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23083 ; GCN-NEXT: s_waitcnt vmcnt(0)
23084 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23085 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23086 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
23087 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32
23088 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
23089 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23090 ; GCN-NEXT: s_waitcnt vmcnt(0)
23091 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23092 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23093 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
23094 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32
23095 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
23096 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23097 ; GCN-NEXT: s_waitcnt vmcnt(0)
23098 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23099 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23100 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
23101 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32
23102 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
23103 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23104 ; GCN-NEXT: s_waitcnt vmcnt(0)
23105 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23106 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23107 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
23108 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32
23109 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
23110 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23111 ; GCN-NEXT: s_waitcnt vmcnt(0)
23112 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23113 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23114 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
23115 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32
23116 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
23117 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23118 ; GCN-NEXT: s_waitcnt vmcnt(0)
23119 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23120 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23121 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
23122 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32
23123 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
23124 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23125 ; GCN-NEXT: s_waitcnt vmcnt(0)
23126 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23127 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23128 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
23129 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32
23130 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
23131 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23132 ; GCN-NEXT: s_waitcnt vmcnt(0)
23133 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23134 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23135 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
23136 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32
23137 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
23138 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23139 ; GCN-NEXT: s_waitcnt vmcnt(0)
23140 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23141 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23142 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
23143 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32
23144 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
23145 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23146 ; GCN-NEXT: s_waitcnt vmcnt(0)
23147 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23148 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23149 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
23150 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32
23151 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
23152 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23153 ; GCN-NEXT: s_waitcnt vmcnt(0)
23154 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23155 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23156 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
23157 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32
23158 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
23159 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23160 ; GCN-NEXT: s_waitcnt vmcnt(0)
23161 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23162 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23163 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
23164 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32
23165 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
23166 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23167 ; GCN-NEXT: s_waitcnt vmcnt(0)
23168 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23169 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23170 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
23171 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32
23172 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
23173 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23174 ; GCN-NEXT: s_waitcnt vmcnt(0)
23175 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23176 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23177 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
23178 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32
23179 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
23180 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23181 ; GCN-NEXT: s_waitcnt vmcnt(0)
23182 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23183 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23184 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
23185 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32
23186 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
23187 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23188 ; GCN-NEXT: s_waitcnt vmcnt(0)
23189 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23190 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23191 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
23192 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32
23193 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
23194 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23195 ; GCN-NEXT: s_waitcnt vmcnt(0)
23196 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23197 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23198 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
23199 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32
23200 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
23201 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23202 ; GCN-NEXT: s_waitcnt vmcnt(0)
23203 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23204 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23205 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
23206 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32
23207 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
23208 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23209 ; GCN-NEXT: s_waitcnt vmcnt(0)
23210 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23211 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23212 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
23213 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32
23214 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
23215 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23216 ; GCN-NEXT: s_waitcnt vmcnt(0)
23217 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23218 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23219 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32
23220 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23221 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23222 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23223 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23224 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23225 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23226 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23227 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23228 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23229 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23230 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23231 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23232 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23233 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23234 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23235 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23236 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23237 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23238 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23239 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23240 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23241 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23242 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23243 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23244 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23245 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23246 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23247 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23248 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23249 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23250 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23251 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23252 ; GCN-NEXT: s_setpc_b64 s[30:31]
23254 ; GFX7-LABEL: v_maxnum_v32bf16:
23256 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23257 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
23258 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
23259 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
23260 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23261 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
23262 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23263 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
23264 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23265 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
23266 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23267 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
23268 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23269 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
23270 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23271 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
23272 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23273 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
23274 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23275 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
23276 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23277 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
23278 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23279 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
23280 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23281 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
23282 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23283 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
23284 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23285 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
23286 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23287 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
23288 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23289 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
23290 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23291 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
23292 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23293 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
23294 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23295 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
23296 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23297 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
23298 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23299 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
23300 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23301 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
23302 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23303 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
23304 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23305 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
23306 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23307 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
23308 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23309 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
23310 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23311 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
23312 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23313 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
23314 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23315 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
23316 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23317 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
23318 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23319 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
23320 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23321 ; GFX7-NEXT: s_waitcnt vmcnt(1)
23322 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
23323 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23324 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23325 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23326 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23327 ; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
23328 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
23329 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23330 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23331 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23332 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23333 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
23334 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
23335 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23336 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23337 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23338 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23339 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
23340 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
23341 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23342 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23343 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23344 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23345 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
23346 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
23347 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23348 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23349 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23350 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23351 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
23352 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
23353 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23354 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23355 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23356 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23357 ; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
23358 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
23359 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23360 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23361 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23362 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23363 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
23364 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
23365 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23366 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23367 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23368 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23369 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
23370 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
23371 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23372 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23373 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23374 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23375 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
23376 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
23377 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23378 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23379 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23380 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23381 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
23382 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
23383 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23384 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23385 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23386 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23387 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
23388 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
23389 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23390 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23391 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23392 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23393 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
23394 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
23395 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23396 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23397 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23398 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23399 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
23400 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
23401 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23402 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23403 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23404 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23405 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
23406 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
23407 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23408 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23409 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23410 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23411 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
23412 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
23413 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23414 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23415 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23416 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23417 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
23418 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
23419 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23420 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23421 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23422 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23423 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
23424 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
23425 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23426 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23427 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23428 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23429 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
23430 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
23431 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23432 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23433 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23434 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23435 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
23436 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
23437 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23438 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23439 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23440 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23441 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
23442 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
23443 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23444 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23445 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23446 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23447 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
23448 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
23449 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23450 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23451 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23452 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23453 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
23454 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
23455 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23456 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23457 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23458 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23459 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
23460 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
23461 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23462 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23463 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23464 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23465 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
23466 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
23467 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23468 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23469 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23470 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23471 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
23472 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
23473 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23474 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23475 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23476 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23477 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
23478 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
23479 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23480 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23481 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23482 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23483 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
23484 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
23485 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23486 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23487 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23488 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23489 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
23490 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
23491 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23492 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23493 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23494 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23495 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
23496 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
23497 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23498 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23499 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23500 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23501 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
23502 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
23503 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23504 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23505 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23506 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23507 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
23508 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
23509 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23510 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23511 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23512 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23513 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
23514 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23515 ; GFX7-NEXT: s_setpc_b64 s[30:31]
23517 ; GFX8-LABEL: v_maxnum_v32bf16:
23519 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23520 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
23521 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
23522 ; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
23523 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
23524 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
23525 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
23526 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23527 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23528 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
23529 ; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
23530 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
23531 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
23532 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
23533 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
23534 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
23535 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
23536 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
23537 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
23538 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
23539 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
23540 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
23541 ; GFX8-NEXT: v_max_f32_e32 v32, v32, v30
23542 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
23543 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
23544 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23545 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23546 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23547 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
23548 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
23549 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
23550 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
23551 ; GFX8-NEXT: s_waitcnt vmcnt(0)
23552 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
23553 ; GFX8-NEXT: v_max_f32_e32 v33, v33, v34
23554 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23555 ; GFX8-NEXT: v_max_f32_e32 v30, v15, v30
23556 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
23557 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
23558 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
23559 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
23560 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
23561 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
23562 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
23563 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
23564 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23565 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
23566 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
23567 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
23568 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
23569 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
23570 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23571 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
23572 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
23573 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
23574 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
23575 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
23576 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
23577 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
23578 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
23579 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
23580 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
23581 ; GFX8-NEXT: v_max_f32_e32 v29, v33, v29
23582 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
23583 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
23584 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23585 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23586 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23587 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
23588 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
23589 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
23590 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
23591 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
23592 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
23593 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
23594 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
23595 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
23596 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
23597 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
23598 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
23599 ; GFX8-NEXT: v_max_f32_e32 v28, v33, v28
23600 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
23601 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
23602 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23603 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23604 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23605 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
23606 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
23607 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
23608 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
23609 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
23610 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
23611 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
23612 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
23613 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
23614 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
23615 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
23616 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
23617 ; GFX8-NEXT: v_max_f32_e32 v27, v33, v27
23618 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
23619 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
23620 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23621 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23622 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23623 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
23624 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
23625 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
23626 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
23627 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
23628 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
23629 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
23630 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
23631 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
23632 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
23633 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
23634 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
23635 ; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
23636 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
23637 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
23638 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23639 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23640 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23641 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
23642 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
23643 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
23644 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
23645 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
23646 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
23647 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
23648 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
23649 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
23650 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
23651 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
23652 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
23653 ; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
23654 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
23655 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
23656 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23657 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23658 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23659 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
23660 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
23661 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
23662 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
23663 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
23664 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
23665 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
23666 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
23667 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
23668 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
23669 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
23670 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
23671 ; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
23672 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
23673 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
23674 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23675 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23676 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23677 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
23678 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
23679 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
23680 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
23681 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
23682 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
23683 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
23684 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
23685 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
23686 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
23687 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
23688 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
23689 ; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
23690 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
23691 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
23692 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23693 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23694 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23695 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
23696 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
23697 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
23698 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
23699 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
23700 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
23701 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
23702 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
23703 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
23704 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
23705 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
23706 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
23707 ; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
23708 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
23709 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
23710 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23711 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23712 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23713 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
23714 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
23715 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
23716 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
23717 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
23718 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
23719 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
23720 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
23721 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
23722 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
23723 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
23724 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
23725 ; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
23726 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
23727 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
23728 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23729 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23730 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23731 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
23732 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
23733 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
23734 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
23735 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
23736 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
23737 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
23738 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
23739 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
23740 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
23741 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
23742 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
23743 ; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
23744 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
23745 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
23746 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23747 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23748 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23749 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
23750 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
23751 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
23752 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
23753 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
23754 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
23755 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
23756 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
23757 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
23758 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
23759 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
23760 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
23761 ; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
23762 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
23763 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
23764 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23765 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23766 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23767 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
23768 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
23769 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
23770 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
23771 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
23772 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
23773 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
23774 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
23775 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
23776 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
23777 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
23778 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
23779 ; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
23780 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
23781 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
23782 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23783 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23784 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23785 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
23786 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
23787 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
23788 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
23789 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
23790 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
23791 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
23792 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
23793 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
23794 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
23795 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
23796 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
23797 ; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
23798 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
23799 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
23800 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23801 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23802 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23803 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
23804 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
23805 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
23806 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
23807 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
23808 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
23809 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
23810 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
23811 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
23812 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
23813 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
23814 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
23815 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
23816 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
23817 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
23818 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
23819 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
23820 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
23821 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
23822 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
23823 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
23824 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
23825 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
23826 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
23827 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
23828 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
23829 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
23830 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
23831 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
23832 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
23833 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
23834 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
23835 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
23836 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
23837 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
23838 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
23839 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
23840 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
23841 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
23842 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
23843 ; GFX8-NEXT: s_setpc_b64 s[30:31]
23845 ; GFX9-LABEL: v_maxnum_v32bf16:
23847 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23848 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
23849 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
23850 ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
23851 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23852 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23853 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
23854 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
23855 ; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
23856 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
23857 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
23858 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
23859 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
23860 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
23861 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
23862 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
23863 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
23864 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
23865 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
23866 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
23867 ; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
23868 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23869 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23870 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
23871 ; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
23872 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
23873 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
23874 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
23875 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
23876 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
23877 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
23878 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
23879 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
23880 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
23881 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
23882 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
23883 ; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
23884 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
23885 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
23886 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23887 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23888 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23889 ; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
23890 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
23891 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
23892 ; GFX9-NEXT: s_waitcnt vmcnt(0)
23893 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
23894 ; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
23895 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23896 ; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
23897 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
23898 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
23899 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
23900 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
23901 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
23902 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
23903 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
23904 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
23905 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
23906 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
23907 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
23908 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
23909 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
23910 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
23911 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
23912 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
23913 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
23914 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
23915 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
23916 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
23917 ; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
23918 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23919 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23920 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
23921 ; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
23922 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
23923 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
23924 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
23925 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
23926 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
23927 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
23928 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
23929 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
23930 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
23931 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
23932 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
23933 ; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
23934 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23935 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23936 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
23937 ; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
23938 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
23939 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
23940 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
23941 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
23942 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
23943 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
23944 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
23945 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
23946 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
23947 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
23948 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
23949 ; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
23950 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23951 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23952 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
23953 ; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
23954 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
23955 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
23956 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
23957 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
23958 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
23959 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
23960 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
23961 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
23962 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
23963 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
23964 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
23965 ; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
23966 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23967 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23968 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
23969 ; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
23970 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
23971 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
23972 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
23973 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
23974 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
23975 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
23976 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
23977 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
23978 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
23979 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
23980 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
23981 ; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
23982 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23983 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23984 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
23985 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
23986 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
23987 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
23988 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
23989 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
23990 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
23991 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
23992 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
23993 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
23994 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
23995 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
23996 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
23997 ; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
23998 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23999 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24000 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
24001 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
24002 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
24003 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
24004 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
24005 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
24006 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
24007 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
24008 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
24009 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
24010 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
24011 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
24012 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
24013 ; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
24014 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24015 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24016 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
24017 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
24018 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
24019 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
24020 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
24021 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
24022 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
24023 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
24024 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
24025 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
24026 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
24027 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
24028 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
24029 ; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
24030 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24031 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24032 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
24033 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
24034 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
24035 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
24036 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
24037 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
24038 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
24039 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
24040 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
24041 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
24042 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
24043 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
24044 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
24045 ; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
24046 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24047 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24048 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
24049 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
24050 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
24051 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
24052 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
24053 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
24054 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
24055 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
24056 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
24057 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
24058 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
24059 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
24060 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
24061 ; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
24062 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24063 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
24064 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
24065 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
24066 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
24067 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
24068 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
24069 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
24070 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
24071 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
24072 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
24073 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
24074 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
24075 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
24076 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
24077 ; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
24078 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24079 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24080 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
24081 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
24082 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
24083 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
24084 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
24085 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
24086 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
24087 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
24088 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
24089 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
24090 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
24091 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
24092 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
24093 ; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
24094 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24095 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24096 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
24097 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
24098 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
24099 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
24100 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
24101 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
24102 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
24103 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
24104 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
24105 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24106 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
24107 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
24108 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
24109 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
24110 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
24111 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
24112 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
24113 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
24114 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
24115 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
24116 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
24117 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
24118 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
24119 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
24120 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
24121 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
24122 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
24123 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
24124 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24126 ; GFX10-LABEL: v_maxnum_v32bf16:
24128 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24129 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
24130 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
24131 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
24132 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
24133 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
24134 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
24135 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
24136 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
24137 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
24138 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
24139 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
24140 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
24141 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
24142 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
24143 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
24144 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
24145 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
24146 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
24147 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
24148 ; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
24149 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
24150 ; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
24151 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
24152 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
24153 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24154 ; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
24155 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
24156 ; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
24157 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
24158 ; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
24159 ; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
24160 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
24161 ; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
24162 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
24163 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
24164 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
24165 ; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
24166 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
24167 ; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
24168 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
24169 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
24170 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
24171 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
24172 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
24173 ; GFX10-NEXT: v_max_f32_e32 v27, v50, v27
24174 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
24175 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24176 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24177 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
24178 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
24179 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
24180 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
24181 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
24182 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
24183 ; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
24184 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
24185 ; GFX10-NEXT: v_max_f32_e32 v29, v38, v29
24186 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
24187 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24188 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
24189 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
24190 ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
24191 ; GFX10-NEXT: v_max_f32_e32 v28, v48, v28
24192 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
24193 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24194 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24195 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
24196 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
24197 ; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
24198 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
24199 ; GFX10-NEXT: v_max_f32_e32 v34, v34, v51
24200 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
24201 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24202 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24203 ; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
24204 ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
24205 ; GFX10-NEXT: v_max_f32_e32 v30, v36, v30
24206 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
24207 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24208 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24209 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
24210 ; GFX10-NEXT: v_max_f32_e32 v18, v48, v23
24211 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
24212 ; GFX10-NEXT: v_max_f32_e32 v17, v50, v22
24213 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
24214 ; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
24215 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
24216 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
24217 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24218 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24219 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
24220 ; GFX10-NEXT: v_max_f32_e32 v20, v36, v25
24221 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
24222 ; GFX10-NEXT: v_max_f32_e32 v19, v38, v24
24223 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
24224 ; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
24225 ; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
24226 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
24227 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
24228 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
24229 ; GFX10-NEXT: v_max_f32_e32 v21, v51, v26
24230 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
24231 ; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
24232 ; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
24233 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
24234 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
24235 ; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
24236 ; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
24237 ; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
24238 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
24239 ; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
24240 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
24241 ; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
24242 ; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
24243 ; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
24244 ; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
24245 ; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
24246 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
24247 ; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
24248 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
24249 ; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
24250 ; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
24251 ; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
24252 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
24253 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
24254 ; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
24255 ; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
24256 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
24257 ; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
24258 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
24259 ; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
24260 ; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
24261 ; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
24262 ; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
24263 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
24264 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
24265 ; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
24266 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
24267 ; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
24268 ; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
24269 ; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
24270 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
24271 ; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
24272 ; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
24273 ; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
24274 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
24275 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
24276 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
24277 ; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
24278 ; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
24279 ; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
24280 ; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
24281 ; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
24282 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
24283 ; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
24284 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
24285 ; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
24286 ; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
24287 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
24288 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
24289 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
24290 ; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
24291 ; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
24292 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
24293 ; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
24294 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
24295 ; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
24296 ; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
24297 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
24298 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
24299 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
24300 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
24301 ; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
24302 ; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
24303 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
24304 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
24305 ; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
24306 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
24307 ; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
24308 ; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
24309 ; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
24310 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
24311 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
24312 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
24313 ; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
24314 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
24315 ; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
24316 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
24317 ; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
24318 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
24319 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
24320 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
24321 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
24322 ; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
24323 ; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
24324 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
24325 ; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
24326 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
24327 ; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
24328 ; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
24329 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
24330 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
24331 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
24332 ; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
24333 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
24334 ; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
24335 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
24336 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
24337 ; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
24338 ; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
24339 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
24340 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
24341 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
24342 ; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
24343 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
24344 ; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
24345 ; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
24346 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
24347 ; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
24348 ; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
24349 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
24350 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
24351 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
24352 ; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
24353 ; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
24354 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
24355 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
24356 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
24357 ; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
24358 ; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
24359 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
24360 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
24361 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
24362 ; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
24363 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
24364 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24365 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
24366 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
24367 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24368 ; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
24369 ; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
24370 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
24371 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
24372 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
24373 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
24374 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
24375 ; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
24376 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
24377 ; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
24378 ; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
24379 ; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
24380 ; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
24381 ; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
24382 ; GFX10-NEXT: s_waitcnt vmcnt(0)
24383 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
24384 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
24385 ; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
24386 ; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
24387 ; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
24388 ; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
24389 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
24390 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24391 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
24392 ; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
24393 ; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
24394 ; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
24395 ; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
24396 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
24397 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
24398 ; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
24399 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
24400 ; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
24401 ; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
24402 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
24403 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24405 ; GFX11-LABEL: v_maxnum_v32bf16:
24407 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24408 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
24409 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
24410 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
24411 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24412 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24413 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
24414 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
24415 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24416 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24417 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
24418 ; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
24419 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
24420 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24421 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
24422 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
24423 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
24424 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24425 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
24426 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
24427 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
24428 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
24429 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
24430 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
24431 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24432 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
24433 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
24434 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
24435 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24436 ; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
24437 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
24438 ; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
24439 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
24440 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
24441 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
24442 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24443 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
24444 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
24445 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
24446 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
24447 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
24448 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
24449 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
24450 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
24451 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24452 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
24453 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24454 ; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
24455 ; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
24456 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
24457 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
24458 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24459 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
24460 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
24461 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24462 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
24463 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
24464 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
24465 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24466 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
24467 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
24468 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
24469 ; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
24470 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v16
24471 ; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
24472 ; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
24473 ; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
24474 ; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
24475 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
24476 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
24477 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
24478 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
24479 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
24480 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
24481 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
24482 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
24483 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24484 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
24485 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24486 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
24487 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
24488 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
24489 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24490 ; GFX11-NEXT: v_max_f32_e32 v4, v4, v20
24491 ; GFX11-NEXT: v_max_f32_e32 v20, v80, v71
24492 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
24493 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
24494 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
24495 ; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
24496 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24497 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
24498 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
24499 ; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
24500 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
24501 ; GFX11-NEXT: v_max_f32_e32 v26, v52, v51
24502 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24503 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
24504 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
24505 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
24506 ; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
24507 ; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
24508 ; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
24509 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
24510 ; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
24511 ; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
24512 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24513 ; GFX11-NEXT: v_max_f32_e32 v29, v38, v37
24514 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
24515 ; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
24516 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
24517 ; GFX11-NEXT: v_max_f32_e32 v14, v14, v30
24518 ; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
24519 ; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
24520 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
24521 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
24522 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
24523 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24524 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
24525 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
24526 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
24527 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
24528 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
24529 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
24530 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
24531 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
24532 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
24533 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
24534 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
24535 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
24536 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
24537 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
24538 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
24539 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
24540 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
24541 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
24542 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
24543 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
24544 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
24545 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
24546 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
24547 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
24548 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
24549 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
24550 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
24551 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
24552 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
24553 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
24554 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
24555 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
24556 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
24557 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
24558 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
24559 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
24560 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
24561 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
24562 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
24563 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
24564 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
24565 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
24566 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
24567 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
24568 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
24569 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
24570 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
24571 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
24572 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
24573 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
24574 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
24575 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
24576 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
24577 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
24578 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
24579 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
24580 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
24581 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
24582 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
24583 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
24584 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
24585 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
24586 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
24587 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
24588 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
24589 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
24590 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
24591 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
24592 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
24593 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
24594 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
24595 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
24596 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
24597 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
24598 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
24599 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
24600 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
24601 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
24602 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
24603 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
24604 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
24605 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
24606 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
24607 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
24608 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
24609 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
24610 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
24611 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
24612 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
24613 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
24614 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
24615 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
24616 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
24617 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
24618 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
24619 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
24620 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
24621 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
24622 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
24623 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
24624 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24625 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
24626 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
24627 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
24628 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
24629 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
24630 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
24631 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
24632 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
24633 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
24634 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
24635 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24636 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
24637 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
24638 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
24639 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
24640 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
24641 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
24642 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
24643 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
24644 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
24645 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
24646 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
24647 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
24648 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
24649 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24650 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24651 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
24652 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
24653 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24654 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
24655 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
24656 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
24657 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
24658 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
24659 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
24660 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
24661 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
24662 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
24663 ; GFX11-NEXT: s_waitcnt vmcnt(0)
24664 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
24665 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24666 ; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
24667 ; GFX11-NEXT: v_max_f32_e32 v15, v15, v18
24668 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24669 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
24670 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
24671 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
24672 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24673 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
24674 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
24675 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
24676 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24677 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
24678 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
24679 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
24680 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
24681 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
24682 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24683 %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
24684 ret <32 x bfloat> %op
; Scalar bfloat square root: a single call to the llvm.sqrt.bf16 intrinsic.
;
; What the expected code below shows, per check prefix:
;  * GCN and GFX7 checks: the argument is canonicalized (v_mul_f32 by 1.0) and
;    masked with 0xffff0000 on entry; the result is masked with 0xffff0000
;    again on exit, with no rounding sequence.
;  * GFX8, GFX9, GFX10 and GFX11 checks: the argument is shifted left by 16 on
;    entry. On exit the 32-bit result goes through
;    v_bfe_u32 (bit 16) + add 0x7fff, with v_or_b32 0x400000 selected instead
;    when the value compares unordered with itself, then a right shift by 16.
;  * Every target shares the same f32 core: inputs below 0xf800000 are
;    multiplied by 0x4f800000 before v_sqrt_f32; the v_sqrt_f32 result is then
;    stepped by -1 / +1 in its integer bit pattern depending on the sign of two
;    v_fma_f32 residuals; the scaled case is multiplied by 0x37800000
;    afterwards; finally v_cmp_class_f32 with mask 0x260 selects the (possibly
;    scaled) input itself in place of the computed value.
;    NOTE(review): 0x260 should correspond to the -0, +0 and +inf classes in
;    the AMDGPU class-mask encoding - confirm against the ISA documentation.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
24687 declare bfloat @llvm.sqrt.bf16(bfloat)
24689 define bfloat @v_sqrt_bf16(bfloat %a) {
24690 ; GCN-LABEL: v_sqrt_bf16:
24692 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24693 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24694 ; GCN-NEXT: s_mov_b32 s4, 0xf800000
24695 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260
24696 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24697 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
24698 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24699 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
24700 ; GCN-NEXT: v_sqrt_f32_e32 v2, v0
24701 ; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
24702 ; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
24703 ; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0
24704 ; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0
24705 ; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
24706 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
24707 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
24708 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
24709 ; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
24710 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
24711 ; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
24712 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
24713 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24714 ; GCN-NEXT: s_setpc_b64 s[30:31]
24716 ; GFX7-LABEL: v_sqrt_bf16:
24718 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24719 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24720 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24721 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000
24722 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24723 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24724 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24725 ; GFX7-NEXT: v_sqrt_f32_e32 v1, v0
24726 ; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
24727 ; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0
24728 ; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24729 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24730 ; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
24731 ; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0
24732 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24733 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24734 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24735 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24736 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
24737 ; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24738 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24739 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24740 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24742 ; GFX8-LABEL: v_sqrt_bf16:
24744 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24745 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24746 ; GFX8-NEXT: s_mov_b32 s4, 0xf800000
24747 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24748 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24749 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24750 ; GFX8-NEXT: v_sqrt_f32_e32 v1, v0
24751 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1
24752 ; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0
24753 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24754 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24755 ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1
24756 ; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0
24757 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24758 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24759 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24760 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24761 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
24762 ; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24763 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24764 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
24765 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
24766 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
24767 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
24768 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24769 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24770 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24771 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24773 ; GFX9-LABEL: v_sqrt_bf16:
24775 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24776 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24777 ; GFX9-NEXT: s_mov_b32 s4, 0xf800000
24778 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24779 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24780 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24781 ; GFX9-NEXT: v_sqrt_f32_e32 v1, v0
24782 ; GFX9-NEXT: v_add_u32_e32 v2, -1, v1
24783 ; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0
24784 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24785 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
24786 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24787 ; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0
24788 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24789 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24790 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24791 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24792 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
24793 ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24794 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24795 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
24796 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
24797 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
24798 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
24799 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24800 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24801 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24802 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24804 ; GFX10-LABEL: v_sqrt_bf16:
24806 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24807 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24808 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24809 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24810 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24811 ; GFX10-NEXT: v_sqrt_f32_e32 v1, v0
24812 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1
24813 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1
24814 ; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0
24815 ; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0
24816 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4
24817 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
24818 ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
24819 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
24820 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24821 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24822 ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24823 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24824 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
24825 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
24826 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24827 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24828 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24829 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24830 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24832 ; GFX11-LABEL: v_sqrt_bf16:
24834 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24835 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24836 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24837 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24838 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24839 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24840 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
24841 ; GFX11-NEXT: v_sqrt_f32_e32 v1, v0
24842 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
24843 ; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v1
24844 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1
24845 ; GFX11-NEXT: v_fma_f32 v4, -v2, v1, v0
24846 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24847 ; GFX11-NEXT: v_fma_f32 v5, -v3, v1, v0
24848 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
24849 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
24850 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
24851 ; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
24852 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24853 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
24854 ; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24855 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24856 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24857 ; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24858 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24859 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
24860 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
24861 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
24862 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24863 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24864 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24865 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24866 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24867 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24868 %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
24872 declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
; Codegen test for llvm.ldexp.bf16.i32 (bfloat value, i32 exponent).
; Shape of the expected code, as pinned by the assertions below:
;  * the bf16 operand is widened into a 32-bit register first (v_mul by 1.0
;    plus a 0xffff0000 mask on GCN/GFX7, a shift left by 16 on GFX8 and later);
;  * the scaling itself is a single v_ldexp_f32 taking the exponent from v1;
;  * GCN/GFX7 narrow the result with another 0xffff0000 mask, while GFX8 and
;    later use the bfe(bit 16) + 0x7fff add, the 0x400000 or, a v_cmp_u
;    self-compare select and a final shift right by 16 (presumably
;    round-to-nearest-even with NaN quieting; confirm against the lowering).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
24874 define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
24875 ; GCN-LABEL: v_ldexp_bf16_i32:
24877 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24878 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24879 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24880 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
24881 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24882 ; GCN-NEXT: s_setpc_b64 s[30:31]
24884 ; GFX7-LABEL: v_ldexp_bf16_i32:
24886 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24887 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24888 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24889 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
24890 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24891 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24893 ; GFX8-LABEL: v_ldexp_bf16_i32:
24895 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24896 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24897 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
24898 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
24899 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
24900 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
24901 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
24902 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24903 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24904 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24905 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24907 ; GFX9-LABEL: v_ldexp_bf16_i32:
24909 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24910 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24911 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
24912 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
24913 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
24914 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
24915 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
24916 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24917 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24918 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24919 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24921 ; GFX10-LABEL: v_ldexp_bf16_i32:
24923 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24924 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24925 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
24926 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
24927 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
24928 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24929 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24930 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24931 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24932 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24934 ; GFX11-LABEL: v_ldexp_bf16_i32:
24936 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24937 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24938 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24939 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
24940 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
24941 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
24942 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24943 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
24944 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24945 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24946 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
24947 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24948 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24949 %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
24953 declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
; Codegen test for llvm.frexp.bf16.i16, which returns { bfloat, i16 }.
; Shape of the expected code, as pinned by the assertions below:
;  * the bf16 input is widened into a 32-bit register;
;  * v_frexp_mant_f32 produces the fraction and v_frexp_exp_i32_f32 the
;    exponent; at the return the fraction is in v0 and the exponent in v1;
;  * only the plain GCN prefix also compares |x| against 0x7f800000 and,
;    when that compare is false, keeps the input as the fraction and uses 0
;    as the exponent; the other prefixes use the instruction results as is;
;  * the fraction is narrowed to bf16 with a 0xffff0000 mask on GCN/GFX7 and
;    with the bfe / 0x7fff / 0x400000 / v_cmp_u / shift-right-16 sequence on
;    GFX8 and later.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
24955 define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
24956 ; GCN-LABEL: v_frexp_bf16_i16:
24958 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24959 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24960 ; GCN-NEXT: s_mov_b32 s4, 0x7f800000
24961 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24962 ; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0
24963 ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
24964 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
24965 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24966 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
24967 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24968 ; GCN-NEXT: s_setpc_b64 s[30:31]
24970 ; GFX7-LABEL: v_frexp_bf16_i16:
24972 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24973 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24974 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24975 ; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
24976 ; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0
24977 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24978 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24980 ; GFX8-LABEL: v_frexp_bf16_i16:
24982 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24983 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
24984 ; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1
24985 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
24986 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
24987 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
24988 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
24989 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24990 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
24991 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24992 ; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
24993 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24995 ; GFX9-LABEL: v_frexp_bf16_i16:
24997 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24998 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
24999 ; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
25000 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
25001 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25002 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
25003 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
25004 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25005 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
25006 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25007 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25008 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25010 ; GFX10-LABEL: v_frexp_bf16_i16:
25012 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25013 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
25014 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1
25015 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25016 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
25017 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
25018 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25019 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
25020 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25021 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25022 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25024 ; GFX11-LABEL: v_frexp_bf16_i16:
25026 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25027 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
25028 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25029 ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v1
25030 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
25031 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
25032 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25033 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25034 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
25035 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25036 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25037 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
25038 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25039 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25040 %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
25041 ret { bfloat, i16 } %op
25045 declare bfloat @llvm.log.bf16(bfloat)
25046 declare bfloat @llvm.log2.bf16(bfloat)
25047 declare bfloat @llvm.log10.bf16(bfloat)
; Codegen test for llvm.log.bf16 (natural logarithm of a bfloat).
; Shape of the expected code, as pinned by the assertions below:
;  * the bf16 input is widened into a 32-bit register;
;  * the input is compared against 0x800000 and, when it is smaller, scaled
;    by v_ldexp_f32 with an exponent of 32 (the 0/1 select result shifted
;    left by 5) before v_log_f32 is applied;
;  * the v_log_f32 result is multiplied by a constant that is split into a
;    high and a low part: 0x3f317000 and 0x3805fdf4 together with a
;    0xfffff000 mask of the operand on GCN/GFX8, or 0x3f317217 and
;    0x3377d1cf combined through FMAs on GFX7 and GFX9 and later
;    (the constant is presumably ln(2); confirm against the lowering);
;  * the raw v_log_f32 result is kept instead of the product when its
;    magnitude is not below 0x7f800000;
;  * 0x41b17218 is subtracted when the input was scaled (0 otherwise), and
;    the result is narrowed back to bf16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25049 define bfloat @v_log_bf16(bfloat %a) {
25050 ; GCN-LABEL: v_log_bf16:
25052 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25053 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25054 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25055 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000
25056 ; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
25057 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25058 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25059 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
25060 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25061 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
25062 ; GCN-NEXT: v_log_f32_e32 v0, v0
25063 ; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
25064 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
25065 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
25066 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
25067 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
25068 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
25069 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25070 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3
25071 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25072 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25073 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
25074 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25075 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25076 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25077 ; GCN-NEXT: s_setpc_b64 s[30:31]
25079 ; GFX7-LABEL: v_log_bf16:
25081 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25082 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25083 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25084 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25085 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25086 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25087 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25088 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
25089 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25090 ; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
25091 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25092 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25093 ; GFX7-NEXT: s_mov_b32 s4, 0x3377d1cf
25094 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25095 ; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
25096 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25097 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25098 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25099 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218
25100 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25101 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25102 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25103 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25105 ; GFX8-LABEL: v_log_bf16:
25107 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25108 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25109 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25110 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25111 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25112 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25113 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
25114 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25115 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
25116 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25117 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
25118 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3f317000, v2
25119 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
25120 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
25121 ; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
25122 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
25123 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
25124 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
25125 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25126 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25127 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218
25128 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25129 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25130 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25131 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25132 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25133 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25134 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25135 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25136 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25137 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25139 ; GFX9-LABEL: v_log_bf16:
25141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25142 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25143 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25144 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25145 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25146 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25147 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
25148 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25149 ; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
25150 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25151 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
25152 ; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf
25153 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
25154 ; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
25155 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
25156 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25157 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25158 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
25159 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25160 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25161 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25162 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25163 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25164 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25165 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25166 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25167 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25168 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25170 ; GFX10-LABEL: v_log_bf16:
25172 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25173 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25174 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25175 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25176 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25177 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
25178 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25179 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25180 ; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
25181 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25182 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
25183 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25184 ; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25185 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25186 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
25187 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25188 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25189 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25190 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25191 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25192 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25193 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25195 ; GFX11-LABEL: v_log_bf16:
25197 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25198 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25200 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25201 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25202 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25203 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25204 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
25205 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25206 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25207 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25208 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25209 ; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
25210 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25211 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25212 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
25213 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25214 ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25215 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25216 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25217 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
25218 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25219 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25220 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25221 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25222 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25223 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25224 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
25225 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25226 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25227 %op = call bfloat @llvm.log.bf16(bfloat %a)
; Codegen test for llvm.log2.bf16 (base-2 logarithm of a bfloat).
; Shape of the expected code, as pinned by the assertions below:
;  * the bf16 input is widened into a 32-bit register;
;  * the input is compared against 0x800000 and, when it is smaller, scaled
;    by v_ldexp_f32 with an exponent of 32 (the 0/1 select result shifted
;    left by 5);
;  * v_log_f32 is applied directly, with no constant multiply afterwards;
;  * 0x42000000 is subtracted when the input was scaled (0 otherwise), and
;    the result is narrowed back to bf16 (0xffff0000 mask on GCN/GFX7, the
;    bfe / 0x7fff / 0x400000 / v_cmp_u / shift-right-16 sequence on GFX8
;    and later).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25231 define bfloat @v_log2_bf16(bfloat %a) {
25232 ; GCN-LABEL: v_log2_bf16:
25234 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25235 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25236 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25237 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
25238 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25239 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25240 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
25241 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25242 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
25243 ; GCN-NEXT: v_log_f32_e32 v0, v0
25244 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25245 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25246 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25247 ; GCN-NEXT: s_setpc_b64 s[30:31]
25249 ; GFX7-LABEL: v_log2_bf16:
25251 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25252 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25253 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25254 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25255 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25256 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25257 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25258 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
25259 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25260 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
25261 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25262 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25263 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25264 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25266 ; GFX8-LABEL: v_log2_bf16:
25268 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25269 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25270 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25271 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25272 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25273 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25274 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
25275 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25276 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
25277 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25278 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25279 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25280 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25281 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25282 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25283 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25284 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25285 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25286 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25288 ; GFX9-LABEL: v_log2_bf16:
25290 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25291 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25292 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25293 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25294 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
25295 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25296 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
25297 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25298 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
25299 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25300 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25301 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25302 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25303 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25304 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25305 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25306 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25307 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25308 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25310 ; GFX10-LABEL: v_log2_bf16:
25312 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25313 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25314 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25315 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
25316 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25317 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25318 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
25319 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25320 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
25321 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25322 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25323 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25324 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25325 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25326 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25327 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25329 ; GFX11-LABEL: v_log2_bf16:
25331 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25332 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25333 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25334 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25335 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
25336 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25337 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25338 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25339 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
25340 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25341 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25342 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
25343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25344 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25345 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25346 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25347 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25349 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25350 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25351 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25352 %op = call bfloat @llvm.log2.bf16(bfloat %a)
; Codegen test for llvm.log10.bf16 (base-10 logarithm of a bfloat).
; Shape of the expected code, as pinned by the assertions below:
;  * the bf16 input is widened into a 32-bit register;
;  * the input is compared against 0x800000 and, when it is smaller, scaled
;    by v_ldexp_f32 with an exponent of 32 (the 0/1 select result shifted
;    left by 5) before v_log_f32 is applied;
;  * the v_log_f32 result is multiplied by a constant that is split into a
;    high and a low part: 0x3e9a2000 and 0x369a84fb together with a
;    0xfffff000 mask of the operand on GCN/GFX8, or 0x3e9a209a and
;    0x3284fbcf combined through FMAs on GFX7 and GFX9 and later
;    (the constant is presumably log10(2); confirm against the lowering);
;  * the raw v_log_f32 result is kept instead of the product when its
;    magnitude is not below 0x7f800000;
;  * 0x411a209b is subtracted when the input was scaled (0 otherwise), and
;    the result is narrowed back to bf16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25356 define bfloat @v_log10_bf16(bfloat %a) {
25357 ; GCN-LABEL: v_log10_bf16:
25359 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25360 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25361 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25362 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000
25363 ; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
25364 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25365 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25366 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
25367 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
25368 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
25369 ; GCN-NEXT: v_log_f32_e32 v0, v0
25370 ; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
25371 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
25372 ; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
25373 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
25374 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
25375 ; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
25376 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25377 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3
25378 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25379 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25380 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
25381 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25382 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25383 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25384 ; GCN-NEXT: s_setpc_b64 s[30:31]
25386 ; GFX7-LABEL: v_log10_bf16:
25388 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25389 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25390 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25391 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25392 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25393 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25394 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25395 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
25396 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25397 ; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
25398 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25399 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25400 ; GFX7-NEXT: s_mov_b32 s4, 0x3284fbcf
25401 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25402 ; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
25403 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25404 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25405 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25406 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b
25407 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25408 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25409 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25410 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25412 ; GFX8-LABEL: v_log10_bf16:
25414 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25415 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25416 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25417 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25418 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25419 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25420 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
25421 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25422 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
25423 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25424 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
25425 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v2
25426 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
25427 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
25428 ; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
25429 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
25430 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
25431 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
25432 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25433 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25434 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b
25435 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25436 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25437 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25438 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25439 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25440 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25441 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25442 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25443 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25444 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25446 ; GFX9-LABEL: v_log10_bf16:
25448 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25449 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25450 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25451 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25452 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
25453 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25454 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
25455 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25456 ; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
25457 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25458 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
25459 ; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf
25460 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
25461 ; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
25462 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
25463 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25464 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25465 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
25466 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25467 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25468 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25469 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25470 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25471 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25472 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25473 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25474 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25475 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25477 ; GFX10-LABEL: v_log10_bf16:
25479 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25480 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25481 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25482 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25483 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25484 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
25485 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25486 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25487 ; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
25488 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25489 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
25490 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25491 ; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25492 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25493 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
25494 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25495 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25496 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25497 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25498 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25499 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25500 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25502 ; GFX11-LABEL: v_log10_bf16:
25504 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25505 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25506 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25507 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25508 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
25509 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
25510 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25511 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
25512 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25513 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25514 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25515 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25516 ; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
25517 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25519 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
25520 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25521 ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25522 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25523 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25524 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
25525 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25526 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25527 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25528 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25529 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25530 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25531 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
25532 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25533 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25534 %op = call bfloat @llvm.log10.bf16(bfloat %a)
25538 declare bfloat @llvm.exp.bf16(bfloat)
25539 declare bfloat @llvm.exp2.bf16(bfloat)
25540 declare bfloat @llvm.exp10.bf16(bfloat)
25542 define bfloat @v_exp_bf16(bfloat %a) {
25543 ; GCN-LABEL: v_exp_bf16:
25545 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25546 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25547 ; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25548 ; GCN-NEXT: s_mov_b32 s5, 0x42b17218
25549 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
25550 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25551 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0
25552 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
25553 ; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
25554 ; GCN-NEXT: v_rndne_f32_e32 v5, v2
25555 ; GCN-NEXT: v_mul_f32_e32 v6, 0x39a3b295, v3
25556 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
25557 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
25558 ; GCN-NEXT: v_add_f32_e32 v3, v3, v6
25559 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
25560 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25561 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25562 ; GCN-NEXT: v_exp_f32_e32 v2, v2
25563 ; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
25564 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25565 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25566 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
25567 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25568 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25569 ; GCN-NEXT: s_setpc_b64 s[30:31]
25571 ; GFX7-LABEL: v_exp_bf16:
25573 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25574 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25575 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25576 ; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b
25577 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25578 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25579 ; GFX7-NEXT: s_mov_b32 s4, 0x32a5705f
25580 ; GFX7-NEXT: v_rndne_f32_e32 v3, v1
25581 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25582 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
25583 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25584 ; GFX7-NEXT: v_exp_f32_e32 v1, v1
25585 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
25586 ; GFX7-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25587 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25588 ; GFX7-NEXT: s_mov_b32 s4, 0x42b17218
25589 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
25590 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25591 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
25592 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25593 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25594 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25595 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25597 ; GFX8-LABEL: v_exp_bf16:
25599 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25600 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25601 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
25602 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0
25603 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3
25604 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
25605 ; GFX8-NEXT: v_rndne_f32_e32 v2, v1
25606 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
25607 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
25608 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
25609 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
25610 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
25611 ; GFX8-NEXT: v_exp_f32_e32 v1, v1
25612 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
25613 ; GFX8-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25614 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25615 ; GFX8-NEXT: s_mov_b32 s4, 0x42b17218
25616 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
25617 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25618 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
25619 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25620 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25621 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25622 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25623 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25624 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25625 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25626 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25627 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25628 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25630 ; GFX9-LABEL: v_exp_bf16:
25632 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25633 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25634 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25635 ; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b
25636 ; GFX9-NEXT: v_rndne_f32_e32 v2, v1
25637 ; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
25638 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
25639 ; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f
25640 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
25641 ; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
25642 ; GFX9-NEXT: v_exp_f32_e32 v1, v1
25643 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
25644 ; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25645 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25646 ; GFX9-NEXT: s_mov_b32 s4, 0x42b17218
25647 ; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
25648 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25649 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
25650 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25651 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25652 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25653 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25654 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25655 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25656 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25657 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25658 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25659 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25661 ; GFX10-LABEL: v_exp_bf16:
25663 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25664 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25665 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25666 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25667 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1
25668 ; GFX10-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25669 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
25670 ; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
25671 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
25672 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
25673 ; GFX10-NEXT: v_exp_f32_e32 v1, v1
25674 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
25675 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25676 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25677 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25678 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25679 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25680 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25681 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25682 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25683 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25684 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25686 ; GFX11-LABEL: v_exp_bf16:
25688 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25689 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25690 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25691 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25692 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1
25693 ; GFX11-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25694 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25695 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
25696 ; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
25697 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
25698 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25699 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25700 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
25701 ; GFX11-NEXT: v_exp_f32_e32 v1, v1
25702 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25703 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
25704 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
25705 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25706 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25707 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25708 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25709 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25710 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25711 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25712 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25713 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25714 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25715 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25716 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25717 %op = call bfloat @llvm.exp.bf16(bfloat %a)
; v_exp2_bf16: test function that calls llvm.exp2 on its single bfloat
; argument. The check lines pin the llc output for each RUN-line prefix.
;
; What the expected sequences show:
;  * The input is prepared as a 32-bit float: v_mul_f32 by 1.0 plus a
;    0xffff0000 mask under the GCN and GFX7 prefixes, a 16-bit left shift
;    under GFX8 and newer.
;  * Inputs below 0xc2fc0000 (-126.0) get 0x42800000 (64.0) added before
;    v_exp_f32, and the result is rescaled by v_ldexp_f32 with -64
;    (materialized as v_not_b32 63, or the literal 0xffffffc0 on GFX10/GFX11).
;  * GCN and GFX7 narrow the result by masking with 0xffff0000. GFX8 and newer
;    use the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 select
;    sequence followed by a 16-bit right shift -- presumably
;    round-to-nearest-even with NaN quieting; confirm against the lowering.
25721 define bfloat @v_exp2_bf16(bfloat %a) {
25722 ; GCN-LABEL: v_exp2_bf16:
25724 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25725 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25726 ; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
25727 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
25728 ; GCN-NEXT: v_not_b32_e32 v2, 63
25729 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25730 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25731 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25732 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
25733 ; GCN-NEXT: v_exp_f32_e32 v0, v0
25734 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
25735 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
25736 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25737 ; GCN-NEXT: s_setpc_b64 s[30:31]
25739 ; GFX7-LABEL: v_exp2_bf16:
25741 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25742 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25743 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25744 ; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000
25745 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000
25746 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25747 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25748 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
25749 ; GFX7-NEXT: v_exp_f32_e32 v0, v0
25750 ; GFX7-NEXT: v_not_b32_e32 v1, 63
25751 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25752 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
25753 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25754 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25756 ; GFX8-LABEL: v_exp2_bf16:
25758 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25759 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25760 ; GFX8-NEXT: s_mov_b32 s4, 0xc2fc0000
25761 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42800000
25762 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25763 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25764 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
25765 ; GFX8-NEXT: v_exp_f32_e32 v0, v0
25766 ; GFX8-NEXT: v_not_b32_e32 v1, 63
25767 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25768 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
25769 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25770 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25771 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25772 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25773 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25774 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25775 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25776 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25778 ; GFX9-LABEL: v_exp2_bf16:
25780 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25781 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25782 ; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
25783 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25784 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000
25785 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25786 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
25787 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
25788 ; GFX9-NEXT: v_not_b32_e32 v1, 63
25789 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25790 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25791 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
25792 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25793 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25794 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25795 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25796 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25797 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25798 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25800 ; GFX10-LABEL: v_exp2_bf16:
25802 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25803 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25804 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25805 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25806 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
25807 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
25808 ; GFX10-NEXT: v_exp_f32_e32 v0, v0
25809 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
25810 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25811 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25812 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25813 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25814 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25815 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25816 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25818 ; GFX11-LABEL: v_exp2_bf16:
25820 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25821 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25822 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25823 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25824 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25825 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
25826 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
25827 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25828 ; GFX11-NEXT: v_exp_f32_e32 v0, v0
25829 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25830 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
25831 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25832 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25833 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25834 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25835 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25836 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
25838 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25839 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
25840 %op = call bfloat @llvm.exp2.bf16(bfloat %a)
; v_exp10_bf16: test function that calls llvm.exp10 on its single bfloat
; argument. The check lines pin the llc output for each RUN-line prefix.
;
; What the expected sequences show:
;  * The input is scaled by a base-2 conversion factor before v_exp_f32.
;    GFX7, GFX9, GFX10 and GFX11 use v_fma_f32 (plus v_fmamk_f32 on
;    GFX10/GFX11) with 0x40549a78 (about 3.3219, i.e. log2(10)) and the
;    correction term 0x33979a37. GCN and GFX8 instead use separate
;    v_mul_f32 / v_add_f32 steps with the split constants 0x40549000 and
;    0x3a2784bc.
;  * The scaled value is split with v_rndne_f32: the fractional remainder goes
;    through v_exp_f32 and the rounded part is applied with v_ldexp_f32 after
;    v_cvt_i32_f32.
;  * Range handling: inputs below 0xc23369f4 select 0, and inputs above
;    0x421a209b select 0x7f800000 (+infinity).
;  * GCN and GFX7 narrow the result by masking with 0xffff0000. GFX8 and newer
;    use the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 select
;    sequence followed by a 16-bit right shift -- presumably
;    round-to-nearest-even with NaN quieting; confirm against the lowering.
25844 define bfloat @v_exp10_bf16(bfloat %a) {
25845 ; GCN-LABEL: v_exp10_bf16:
25847 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25848 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25849 ; GCN-NEXT: s_mov_b32 s4, 0xc23369f4
25850 ; GCN-NEXT: s_mov_b32 s5, 0x421a209b
25851 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
25852 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25853 ; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0
25854 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
25855 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
25856 ; GCN-NEXT: v_rndne_f32_e32 v5, v2
25857 ; GCN-NEXT: v_mul_f32_e32 v6, 0x3a2784bc, v3
25858 ; GCN-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
25859 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
25860 ; GCN-NEXT: v_add_f32_e32 v3, v3, v6
25861 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
25862 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25863 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25864 ; GCN-NEXT: v_exp_f32_e32 v2, v2
25865 ; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
25866 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25867 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25868 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
25869 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25870 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25871 ; GCN-NEXT: s_setpc_b64 s[30:31]
25873 ; GFX7-LABEL: v_exp10_bf16:
25875 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25876 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25877 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25878 ; GFX7-NEXT: s_mov_b32 s4, 0x40549a78
25879 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25880 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25881 ; GFX7-NEXT: s_mov_b32 s4, 0x33979a37
25882 ; GFX7-NEXT: v_rndne_f32_e32 v3, v1
25883 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25884 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
25885 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25886 ; GFX7-NEXT: v_exp_f32_e32 v1, v1
25887 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
25888 ; GFX7-NEXT: s_mov_b32 s4, 0xc23369f4
25889 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25890 ; GFX7-NEXT: s_mov_b32 s4, 0x421a209b
25891 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
25892 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25893 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
25894 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25895 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25896 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25897 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25899 ; GFX8-LABEL: v_exp10_bf16:
25901 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25902 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25903 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
25904 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x40549000, v0
25905 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3
25906 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
25907 ; GFX8-NEXT: v_rndne_f32_e32 v2, v1
25908 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
25909 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
25910 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
25911 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
25912 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
25913 ; GFX8-NEXT: v_exp_f32_e32 v1, v1
25914 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
25915 ; GFX8-NEXT: s_mov_b32 s4, 0xc23369f4
25916 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25917 ; GFX8-NEXT: s_mov_b32 s4, 0x421a209b
25918 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
25919 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25920 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
25921 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25922 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25923 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25924 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25925 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25926 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25927 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25928 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25929 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25930 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25932 ; GFX9-LABEL: v_exp10_bf16:
25934 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25935 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25936 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25937 ; GFX9-NEXT: s_mov_b32 s4, 0x40549a78
25938 ; GFX9-NEXT: v_rndne_f32_e32 v2, v1
25939 ; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
25940 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
25941 ; GFX9-NEXT: s_mov_b32 s4, 0x33979a37
25942 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
25943 ; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
25944 ; GFX9-NEXT: v_exp_f32_e32 v1, v1
25945 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
25946 ; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4
25947 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25948 ; GFX9-NEXT: s_mov_b32 s4, 0x421a209b
25949 ; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
25950 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25951 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
25952 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25953 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25954 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25955 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25956 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25957 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25958 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25959 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25960 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25961 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25963 ; GFX10-LABEL: v_exp10_bf16:
25965 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25966 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25967 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25968 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
25969 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1
25970 ; GFX10-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
25971 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
25972 ; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
25973 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
25974 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
25975 ; GFX10-NEXT: v_exp_f32_e32 v1, v1
25976 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
25977 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25978 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
25979 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25980 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25981 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25982 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25983 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25984 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25985 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25986 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25988 ; GFX11-LABEL: v_exp10_bf16:
25990 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25991 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25992 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25993 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25994 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1
25995 ; GFX11-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
25996 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25997 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
25998 ; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
25999 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
26000 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
26001 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26002 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
26003 ; GFX11-NEXT: v_exp_f32_e32 v1, v1
26004 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
26005 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
26006 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
26007 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
26008 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
26009 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
26010 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26011 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26012 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26013 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26014 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26015 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26016 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26017 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26018 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
26019 %op = call bfloat @llvm.exp10.bf16(bfloat %a)
26023 declare bfloat @llvm.ceil.bf16(bfloat)
; v_ceil_bf16: test function that calls llvm.ceil on its single bfloat
; argument. The check lines pin the llc output for each RUN-line prefix.
;
; Every variant performs the operation with a single v_ceil_f32 on a 32-bit
; float. GCN and GFX7 prepare the input with v_mul_f32 by 1.0 plus a
; 0xffff0000 mask and narrow the result with the same mask. GFX8 and newer
; shift the input left by 16 and narrow with the v_bfe_u32 / add 0x7fff /
; v_or_b32 0x400000 / v_cmp_u_f32 select sequence and a 16-bit right shift
; -- presumably round-to-nearest-even with NaN quieting; confirm against the
; lowering.
26025 define bfloat @v_ceil_bf16(bfloat %a) {
26026 ; GCN-LABEL: v_ceil_bf16:
26028 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26029 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26030 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26031 ; GCN-NEXT: v_ceil_f32_e32 v0, v0
26032 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26033 ; GCN-NEXT: s_setpc_b64 s[30:31]
26035 ; GFX7-LABEL: v_ceil_bf16:
26037 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26038 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26039 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26040 ; GFX7-NEXT: v_ceil_f32_e32 v0, v0
26041 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26042 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26044 ; GFX8-LABEL: v_ceil_bf16:
26046 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26047 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26048 ; GFX8-NEXT: v_ceil_f32_e32 v0, v0
26049 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26050 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26051 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26052 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26053 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26054 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26055 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26056 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26058 ; GFX9-LABEL: v_ceil_bf16:
26060 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26061 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26062 ; GFX9-NEXT: v_ceil_f32_e32 v0, v0
26063 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26064 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26065 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26066 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26067 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26068 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26069 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26070 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26072 ; GFX10-LABEL: v_ceil_bf16:
26074 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26075 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26076 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0
26077 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26078 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26079 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26080 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26081 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26082 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26083 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26085 ; GFX11-LABEL: v_ceil_bf16:
26087 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26088 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26089 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26090 ; GFX11-NEXT: v_ceil_f32_e32 v0, v0
26091 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26092 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26093 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26094 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26095 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26096 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26097 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26098 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26099 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
26100 %op = call bfloat @llvm.ceil.bf16(bfloat %a)
26104 declare bfloat @llvm.trunc.bf16(bfloat)
; v_trunc_bf16: test function that calls llvm.trunc on its single bfloat
; argument. The check lines pin the llc output for each RUN-line prefix.
;
; Every variant performs the operation with a single v_trunc_f32 on a 32-bit
; float. GCN and GFX7 prepare the input with v_mul_f32 by 1.0 plus a
; 0xffff0000 mask and narrow the result with the same mask. GFX8 and newer
; shift the input left by 16 and narrow with the v_bfe_u32 / add 0x7fff /
; v_or_b32 0x400000 / v_cmp_u_f32 select sequence and a 16-bit right shift
; -- presumably round-to-nearest-even with NaN quieting; confirm against the
; lowering.
26106 define bfloat @v_trunc_bf16(bfloat %a) {
26107 ; GCN-LABEL: v_trunc_bf16:
26109 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26110 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26111 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26112 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
26113 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26114 ; GCN-NEXT: s_setpc_b64 s[30:31]
26116 ; GFX7-LABEL: v_trunc_bf16:
26118 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26119 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26120 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26121 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
26122 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26123 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26125 ; GFX8-LABEL: v_trunc_bf16:
26127 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26128 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26129 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0
26130 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26131 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26132 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26133 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26134 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26135 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26136 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26137 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26139 ; GFX9-LABEL: v_trunc_bf16:
26141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26142 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26143 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0
26144 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26145 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26146 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26147 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26148 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26149 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26150 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26151 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26153 ; GFX10-LABEL: v_trunc_bf16:
26155 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26156 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26157 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
26158 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26159 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26160 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26161 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26162 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26163 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26164 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26166 ; GFX11-LABEL: v_trunc_bf16:
26168 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26169 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26171 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
26172 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26173 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26174 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26175 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26176 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26177 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26178 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26179 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26180 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
26181 %op = call bfloat @llvm.trunc.bf16(bfloat %a)
26185 declare bfloat @llvm.rint.bf16(bfloat)
; v_rint_bf16: test function that calls llvm.rint on its single bfloat
; argument. The check lines pin the llc output for each RUN-line prefix.
;
; Every variant performs the operation with a single v_rndne_f32 on a 32-bit
; float. GCN and GFX7 prepare the input with v_mul_f32 by 1.0 plus a
; 0xffff0000 mask and narrow the result with the same mask. GFX8 and newer
; shift the input left by 16 and narrow with the v_bfe_u32 / add 0x7fff /
; v_or_b32 0x400000 / v_cmp_u_f32 select sequence and a 16-bit right shift
; -- presumably round-to-nearest-even with NaN quieting; confirm against the
; lowering.
26187 define bfloat @v_rint_bf16(bfloat %a) {
26188 ; GCN-LABEL: v_rint_bf16:
26190 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26191 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26192 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26193 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26194 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26195 ; GCN-NEXT: s_setpc_b64 s[30:31]
26197 ; GFX7-LABEL: v_rint_bf16:
26199 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26200 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26201 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26202 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26203 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26204 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26206 ; GFX8-LABEL: v_rint_bf16:
26208 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26209 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26210 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26211 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26212 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26213 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26214 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26215 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26216 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26217 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26218 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26220 ; GFX9-LABEL: v_rint_bf16:
26222 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26223 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26224 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26225 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26226 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26227 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26228 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26229 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26230 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26231 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26232 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26234 ; GFX10-LABEL: v_rint_bf16:
26236 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26237 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26238 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26239 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26240 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26241 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26242 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26243 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26244 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26245 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26247 ; GFX11-LABEL: v_rint_bf16:
26249 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26250 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26251 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26252 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26253 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26254 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26255 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26256 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26257 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26258 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26259 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26260 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26261 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
26262 %op = call bfloat @llvm.rint.bf16(bfloat %a)
26266 declare bfloat @llvm.nearbyint.bf16(bfloat)
; v_nearbyint_bf16: test function that calls llvm.nearbyint on its single
; bfloat argument. The check lines pin the llc output for each RUN-line
; prefix.
;
; Every variant performs the operation with a single v_rndne_f32 on a 32-bit
; float; the expected instruction sequences match those checked for
; v_rint_bf16 in this file. GCN and GFX7 prepare the input with v_mul_f32 by
; 1.0 plus a 0xffff0000 mask and narrow the result with the same mask. GFX8
; and newer shift the input left by 16 and narrow with the v_bfe_u32 /
; add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 select sequence and a 16-bit
; right shift -- presumably round-to-nearest-even with NaN quieting; confirm
; against the lowering.
26268 define bfloat @v_nearbyint_bf16(bfloat %a) {
26269 ; GCN-LABEL: v_nearbyint_bf16:
26271 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26272 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26273 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26274 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26275 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26276 ; GCN-NEXT: s_setpc_b64 s[30:31]
26278 ; GFX7-LABEL: v_nearbyint_bf16:
26280 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26281 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26282 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26283 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26284 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26285 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26287 ; GFX8-LABEL: v_nearbyint_bf16:
26289 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26290 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26291 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26292 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26293 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26294 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26295 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26296 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26297 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26298 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26299 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26301 ; GFX9-LABEL: v_nearbyint_bf16:
26303 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26304 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26305 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26306 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26307 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26308 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26309 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26310 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26311 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26312 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26313 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26315 ; GFX10-LABEL: v_nearbyint_bf16:
26317 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26318 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26319 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26320 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26321 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26322 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26323 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26324 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26325 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26326 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26328 ; GFX11-LABEL: v_nearbyint_bf16:
26330 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26331 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26332 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26333 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26334 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26335 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26336 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26338 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26339 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26340 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26341 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26342 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one intrinsic call on the incoming argument.
26343 %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
26347 declare bfloat @llvm.round.bf16(bfloat)
; Codegen test for llvm.round.bf16 on a scalar bfloat argument.
; Every prefix expects the rounding expanded in f32 as v_trunc_f32, a subtract
; to get the fraction, a compare of |fraction| against 0.5 selecting 0 or 1.0,
; a v_bfi_b32 with mask 0x7fffffff (materialized with s_brev_b32 -2 before
; GFX10) and a final v_add_f32.
; GCN and GFX7 mask the input and the result with 0xffff0000. GFX8 and later
; shift the input left by 16 and narrow the result with v_bfe_u32, an add of
; 0x7fff, v_or_b32 0x400000 selected by v_cmp_u_f32, and a right shift by 16.
26349 define bfloat @v_round_bf16(bfloat %a) {
26350 ; GCN-LABEL: v_round_bf16:
26352 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26353 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26354 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26355 ; GCN-NEXT: v_trunc_f32_e32 v1, v0
26356 ; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
26357 ; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26358 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26359 ; GCN-NEXT: s_brev_b32 s4, -2
26360 ; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
26361 ; GCN-NEXT: v_add_f32_e32 v0, v1, v0
26362 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26363 ; GCN-NEXT: s_setpc_b64 s[30:31]
26365 ; GFX7-LABEL: v_round_bf16:
26367 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26368 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26369 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26370 ; GFX7-NEXT: v_trunc_f32_e32 v1, v0
26371 ; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1
26372 ; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26373 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26374 ; GFX7-NEXT: s_brev_b32 s4, -2
26375 ; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0
26376 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
26377 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26378 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26380 ; GFX8-LABEL: v_round_bf16:
26382 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26383 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26384 ; GFX8-NEXT: v_trunc_f32_e32 v1, v0
26385 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
26386 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26387 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26388 ; GFX8-NEXT: s_brev_b32 s4, -2
26389 ; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0
26390 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
26391 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26392 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26393 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26394 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26395 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26396 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26397 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26398 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26400 ; GFX9-LABEL: v_round_bf16:
26402 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26403 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26404 ; GFX9-NEXT: v_trunc_f32_e32 v1, v0
26405 ; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
26406 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26407 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26408 ; GFX9-NEXT: s_brev_b32 s4, -2
26409 ; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
26410 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
26411 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26412 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26413 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26414 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26415 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26416 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26417 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26418 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26420 ; GFX10-LABEL: v_round_bf16:
26422 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26423 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26424 ; GFX10-NEXT: v_trunc_f32_e32 v1, v0
26425 ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1
26426 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
26427 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
26428 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
26429 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
26430 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26431 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26432 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26433 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26434 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26435 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26436 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26438 ; GFX11-LABEL: v_round_bf16:
26440 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26441 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26442 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26443 ; GFX11-NEXT: v_trunc_f32_e32 v1, v0
26444 ; GFX11-NEXT: v_sub_f32_e32 v2, v0, v1
26445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26446 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
26447 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
26448 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26449 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
26450 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
26451 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26452 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26453 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26454 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26455 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26456 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26457 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26458 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26459 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26460 %op = call bfloat @llvm.round.bf16(bfloat %a)
26464 declare bfloat @llvm.roundeven.bf16(bfloat)
; Codegen test for llvm.roundeven.bf16 on a scalar bfloat argument.
; Every prefix expects a single v_rndne_f32_e32 for the rounding itself.
; GCN and GFX7 mask the input and the result with 0xffff0000. GFX8 and later
; shift the input left by 16 and narrow the result with v_bfe_u32, an add of
; 0x7fff, v_or_b32 0x400000 selected by v_cmp_u_f32, and a right shift by 16.
26466 define bfloat @v_roundeven_bf16(bfloat %a) {
26467 ; GCN-LABEL: v_roundeven_bf16:
26469 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26470 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26471 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26472 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26473 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26474 ; GCN-NEXT: s_setpc_b64 s[30:31]
26476 ; GFX7-LABEL: v_roundeven_bf16:
26478 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26479 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26480 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26481 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26482 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26483 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26485 ; GFX8-LABEL: v_roundeven_bf16:
26487 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26488 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26489 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26490 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26491 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26492 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26493 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26494 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26495 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26496 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26497 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26499 ; GFX9-LABEL: v_roundeven_bf16:
26501 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26502 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26503 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26504 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26505 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26506 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26507 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26508 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26509 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26510 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26511 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26513 ; GFX10-LABEL: v_roundeven_bf16:
26515 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26516 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26517 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26518 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26519 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26520 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26521 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26522 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26523 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26524 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26526 ; GFX11-LABEL: v_roundeven_bf16:
26528 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26529 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26530 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26531 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26532 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26533 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26534 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26536 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26537 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26539 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26540 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26541 %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
26545 declare bfloat @llvm.floor.bf16(bfloat)
; Codegen test for llvm.floor.bf16 on a scalar bfloat argument.
; Every prefix expects a single v_floor_f32_e32 for the operation itself.
; GCN and GFX7 mask the input and the result with 0xffff0000. GFX8 and later
; shift the input left by 16 and narrow the result with v_bfe_u32, an add of
; 0x7fff, v_or_b32 0x400000 selected by v_cmp_u_f32, and a right shift by 16.
26547 define bfloat @v_floor_bf16(bfloat %a) {
26548 ; GCN-LABEL: v_floor_bf16:
26550 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26551 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26552 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26553 ; GCN-NEXT: v_floor_f32_e32 v0, v0
26554 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26555 ; GCN-NEXT: s_setpc_b64 s[30:31]
26557 ; GFX7-LABEL: v_floor_bf16:
26559 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26560 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26561 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26562 ; GFX7-NEXT: v_floor_f32_e32 v0, v0
26563 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26564 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26566 ; GFX8-LABEL: v_floor_bf16:
26568 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26569 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26570 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
26571 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26572 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26573 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26574 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26575 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26576 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26577 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26578 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26580 ; GFX9-LABEL: v_floor_bf16:
26582 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26583 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26584 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
26585 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26586 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26587 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26588 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26589 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26590 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26591 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26592 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26594 ; GFX10-LABEL: v_floor_bf16:
26596 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26597 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26598 ; GFX10-NEXT: v_floor_f32_e32 v0, v0
26599 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26600 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26601 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26602 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26603 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26604 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26605 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26607 ; GFX11-LABEL: v_floor_bf16:
26609 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26610 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26611 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26612 ; GFX11-NEXT: v_floor_f32_e32 v0, v0
26613 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26614 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26615 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26616 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26617 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26618 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26619 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26620 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26621 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26622 %op = call bfloat @llvm.floor.bf16(bfloat %a)
26626 declare bfloat @llvm.canonicalize.bf16(bfloat)
; Codegen test for llvm.canonicalize.bf16 on a scalar bfloat argument.
; GCN and GFX7 expect v_mul_f32 by 1.0 followed by two 0xffff0000 masks.
; GFX8 expects a shift left by 16, v_mul_f32 by 1.0, then the narrowing
; sequence (v_bfe_u32, add of 0x7fff, v_or_b32 0x400000 selected by
; v_cmp_u_f32, right shift by 16). GFX9 and later expect v_max_f32 v0, v0 in
; place of the multiply, followed by the same narrowing sequence.
; NOTE(review) - the two back-to-back identical v_and_b32 in the GCN and GFX7
; expectations look redundant; they are recorded llc output, so they are kept.
26628 define bfloat @v_canonicalize_bf16(bfloat %a) {
26629 ; GCN-LABEL: v_canonicalize_bf16:
26631 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26632 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26633 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26634 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26635 ; GCN-NEXT: s_setpc_b64 s[30:31]
26637 ; GFX7-LABEL: v_canonicalize_bf16:
26639 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26640 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26641 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26642 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26643 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26645 ; GFX8-LABEL: v_canonicalize_bf16:
26647 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26648 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26649 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
26650 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26651 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26652 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26653 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26654 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26655 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26656 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26657 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26659 ; GFX9-LABEL: v_canonicalize_bf16:
26661 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26662 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26663 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
26664 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26665 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26666 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26667 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26668 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26669 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26670 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26671 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26673 ; GFX10-LABEL: v_canonicalize_bf16:
26675 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26676 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26677 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
26678 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26679 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26680 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26681 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26682 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26683 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26684 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26686 ; GFX11-LABEL: v_canonicalize_bf16:
26688 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26689 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26690 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26691 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
26692 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26693 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26694 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26695 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26696 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26697 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26698 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26699 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26700 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26701 %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
26705 declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
26707 ; FIXME: Promotion is broken for llvm.arithmetic.fence.bf16, so the test below stays commented out.
26708 ; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
26709 ; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
; Codegen test for fcmp false on two bfloat arguments.
; Every prefix expects no compare instruction at all, only v_mov_b32 v0, 0
; before the return.
26713 define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
26714 ; GCN-LABEL: v_fcmp_false_bf16:
26716 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26717 ; GCN-NEXT: v_mov_b32_e32 v0, 0
26718 ; GCN-NEXT: s_setpc_b64 s[30:31]
26720 ; GFX7-LABEL: v_fcmp_false_bf16:
26722 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26723 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
26724 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26726 ; GFX8-LABEL: v_fcmp_false_bf16:
26728 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26729 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
26730 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26732 ; GFX9-LABEL: v_fcmp_false_bf16:
26734 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26735 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
26736 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26738 ; GFX10-LABEL: v_fcmp_false_bf16:
26740 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26741 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
26742 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26744 ; GFX11-LABEL: v_fcmp_false_bf16:
26746 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26747 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
26748 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26749 %op = fcmp false bfloat %a, %b
; Codegen test for fcmp oeq on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_eq_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
26753 define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
26754 ; GCN-LABEL: v_fcmp_oeq_bf16:
26756 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26757 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26758 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26759 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26760 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26761 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26762 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26763 ; GCN-NEXT: s_setpc_b64 s[30:31]
26765 ; GFX7-LABEL: v_fcmp_oeq_bf16:
26767 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26768 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26769 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26770 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26771 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26772 ; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26773 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26774 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26776 ; GFX8-LABEL: v_fcmp_oeq_bf16:
26778 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26779 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26780 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26781 ; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26782 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26783 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26785 ; GFX9-LABEL: v_fcmp_oeq_bf16:
26787 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26788 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26789 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26790 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26791 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26792 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26794 ; GFX10-LABEL: v_fcmp_oeq_bf16:
26796 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26797 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26798 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26799 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
26800 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26801 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26803 ; GFX11-LABEL: v_fcmp_oeq_bf16:
26805 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26806 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26807 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26808 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26809 ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
26810 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26811 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26812 %op = fcmp oeq bfloat %a, %b
; Codegen test for fcmp ogt on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_gt_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
26816 define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
26817 ; GCN-LABEL: v_fcmp_ogt_bf16:
26819 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26820 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26821 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26822 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26823 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26824 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26825 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26826 ; GCN-NEXT: s_setpc_b64 s[30:31]
26828 ; GFX7-LABEL: v_fcmp_ogt_bf16:
26830 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26831 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26832 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26833 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26834 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26835 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26836 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26837 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26839 ; GFX8-LABEL: v_fcmp_ogt_bf16:
26841 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26842 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26843 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26844 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26845 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26846 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26848 ; GFX9-LABEL: v_fcmp_ogt_bf16:
26850 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26851 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26852 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26853 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26854 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26855 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26857 ; GFX10-LABEL: v_fcmp_ogt_bf16:
26859 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26860 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26861 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26862 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
26863 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26864 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26866 ; GFX11-LABEL: v_fcmp_ogt_bf16:
26868 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26869 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26870 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26871 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26872 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
26873 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26874 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26875 %op = fcmp ogt bfloat %a, %b
; Codegen test for fcmp oge on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_ge_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
26879 define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
26880 ; GCN-LABEL: v_fcmp_oge_bf16:
26882 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26883 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26884 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26885 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26886 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26887 ; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26888 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26889 ; GCN-NEXT: s_setpc_b64 s[30:31]
26891 ; GFX7-LABEL: v_fcmp_oge_bf16:
26893 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26894 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26895 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26896 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26897 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26898 ; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26899 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26900 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26902 ; GFX8-LABEL: v_fcmp_oge_bf16:
26904 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26905 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26906 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26907 ; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26908 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26909 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26911 ; GFX9-LABEL: v_fcmp_oge_bf16:
26913 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26914 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26915 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26916 ; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26917 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26918 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26920 ; GFX10-LABEL: v_fcmp_oge_bf16:
26922 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26923 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26924 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26925 ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
26926 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26927 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26929 ; GFX11-LABEL: v_fcmp_oge_bf16:
26931 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26932 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26933 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26934 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26935 ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
26936 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26937 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26938 %op = fcmp oge bfloat %a, %b
; Codegen test for fcmp olt on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_lt_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
26942 define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
26943 ; GCN-LABEL: v_fcmp_olt_bf16:
26945 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26946 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26947 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26948 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26949 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26950 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26951 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26952 ; GCN-NEXT: s_setpc_b64 s[30:31]
26954 ; GFX7-LABEL: v_fcmp_olt_bf16:
26956 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26957 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26958 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26959 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26960 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26961 ; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26962 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26963 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26965 ; GFX8-LABEL: v_fcmp_olt_bf16:
26967 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26968 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26969 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26970 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26971 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26972 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26974 ; GFX9-LABEL: v_fcmp_olt_bf16:
26976 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26977 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26978 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26979 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26980 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26981 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26983 ; GFX10-LABEL: v_fcmp_olt_bf16:
26985 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26986 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26987 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26988 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
26989 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26990 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26992 ; GFX11-LABEL: v_fcmp_olt_bf16:
26994 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26995 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26996 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26998 ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
26999 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27000 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27001 %op = fcmp olt bfloat %a, %b
; Codegen test for fcmp ole on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_le_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
27005 define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
27006 ; GCN-LABEL: v_fcmp_ole_bf16:
27008 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27009 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27010 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27011 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27012 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27013 ; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27014 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27015 ; GCN-NEXT: s_setpc_b64 s[30:31]
27017 ; GFX7-LABEL: v_fcmp_ole_bf16:
27019 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27020 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27021 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27022 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27023 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27024 ; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27025 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27026 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27028 ; GFX8-LABEL: v_fcmp_ole_bf16:
27030 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27031 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27032 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27033 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27034 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27035 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27037 ; GFX9-LABEL: v_fcmp_ole_bf16:
27039 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27040 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27041 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27042 ; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27043 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27044 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27046 ; GFX10-LABEL: v_fcmp_ole_bf16:
27048 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27049 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27050 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27051 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
27052 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27053 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27055 ; GFX11-LABEL: v_fcmp_ole_bf16:
27057 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27058 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27059 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27060 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27061 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
27062 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27063 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27064 %op = fcmp ole bfloat %a, %b
; Codegen test for fcmp one on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_lg_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
27068 define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
27069 ; GCN-LABEL: v_fcmp_one_bf16:
27071 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27072 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27073 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27074 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27075 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27076 ; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27077 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27078 ; GCN-NEXT: s_setpc_b64 s[30:31]
27080 ; GFX7-LABEL: v_fcmp_one_bf16:
27082 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27083 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27084 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27085 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27086 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27087 ; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27088 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27089 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27091 ; GFX8-LABEL: v_fcmp_one_bf16:
27093 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27094 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27095 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27096 ; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27097 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27098 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27100 ; GFX9-LABEL: v_fcmp_one_bf16:
27102 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27103 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27104 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27105 ; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27106 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27109 ; GFX10-LABEL: v_fcmp_one_bf16:
27111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27112 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27113 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27114 ; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
27115 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27116 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27118 ; GFX11-LABEL: v_fcmp_one_bf16:
27120 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27121 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27122 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27123 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27124 ; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
27125 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27126 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27127 %op = fcmp one bfloat %a, %b
; Codegen test for fcmp uno on two bfloat arguments.
; GCN and GFX7 expect both operands multiplied by 1.0 and masked with
; 0xffff0000; GFX8 and later expect both operands shifted left by 16.
; The compare is then v_cmp_u_f32 followed by a v_cndmask_b32 producing 0 or 1
; (GFX10 and GFX11 write the condition to vcc_lo instead of vcc).
27131 define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
27132 ; GCN-LABEL: v_fcmp_uno_bf16:
27134 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27135 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27136 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27137 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27138 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27139 ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27140 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27141 ; GCN-NEXT: s_setpc_b64 s[30:31]
27143 ; GFX7-LABEL: v_fcmp_uno_bf16:
27145 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27146 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27147 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27148 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27149 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27150 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27151 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27152 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27154 ; GFX8-LABEL: v_fcmp_uno_bf16:
27156 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27157 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27158 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27159 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27160 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27161 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27163 ; GFX9-LABEL: v_fcmp_uno_bf16:
27165 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27166 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27167 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27168 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27169 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27170 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27172 ; GFX10-LABEL: v_fcmp_uno_bf16:
27174 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27175 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27176 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27177 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
27178 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27179 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27181 ; GFX11-LABEL: v_fcmp_uno_bf16:
27183 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27184 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27185 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27186 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27187 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
27188 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27189 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27190 %op = fcmp uno bfloat %a, %b
; Test: `fcmp ueq` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_nlg_f32 and materialize 0/1 with
; v_cndmask_b32. Check lines are autogenerated (see the NOTE at the top of
; the file); regenerate them instead of editing by hand.
27194 define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
27195 ; GCN-LABEL: v_fcmp_ueq_bf16:
27197 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27198 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27199 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27200 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27201 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27202 ; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27203 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27204 ; GCN-NEXT: s_setpc_b64 s[30:31]
27206 ; GFX7-LABEL: v_fcmp_ueq_bf16:
27208 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27209 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27210 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27211 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27212 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27213 ; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27214 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27215 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27217 ; GFX8-LABEL: v_fcmp_ueq_bf16:
27219 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27220 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27221 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27222 ; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27223 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27224 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27226 ; GFX9-LABEL: v_fcmp_ueq_bf16:
27228 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27229 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27230 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27231 ; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27232 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27233 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27235 ; GFX10-LABEL: v_fcmp_ueq_bf16:
27237 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27238 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27239 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27240 ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27241 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27242 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27244 ; GFX11-LABEL: v_fcmp_ueq_bf16:
27246 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27247 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27248 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27249 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27250 ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27251 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27252 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27253 %op = fcmp ueq bfloat %a, %b
; Test: `fcmp ugt` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_nle_f32 (the negated "less or equal"
; form) and materialize 0/1 with v_cndmask_b32. Check lines are
; autogenerated; regenerate them instead of editing by hand.
27257 define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
27258 ; GCN-LABEL: v_fcmp_ugt_bf16:
27260 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27261 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27262 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27263 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27264 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27265 ; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27266 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27267 ; GCN-NEXT: s_setpc_b64 s[30:31]
27269 ; GFX7-LABEL: v_fcmp_ugt_bf16:
27271 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27272 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27273 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27274 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27275 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27276 ; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27277 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27278 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27280 ; GFX8-LABEL: v_fcmp_ugt_bf16:
27282 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27283 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27284 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27285 ; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27286 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27287 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27289 ; GFX9-LABEL: v_fcmp_ugt_bf16:
27291 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27292 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27293 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27294 ; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27295 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27296 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27298 ; GFX10-LABEL: v_fcmp_ugt_bf16:
27300 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27301 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27302 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27303 ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
27304 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27305 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27307 ; GFX11-LABEL: v_fcmp_ugt_bf16:
27309 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27310 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27311 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27312 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27313 ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
27314 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27315 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27316 %op = fcmp ugt bfloat %a, %b
; Test: `fcmp uge` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_nlt_f32 (the negated "less than" form)
; and materialize 0/1 with v_cndmask_b32. Check lines are autogenerated;
; regenerate them instead of editing by hand.
27320 define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
27321 ; GCN-LABEL: v_fcmp_uge_bf16:
27323 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27324 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27325 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27326 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27327 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27328 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27329 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27330 ; GCN-NEXT: s_setpc_b64 s[30:31]
27332 ; GFX7-LABEL: v_fcmp_uge_bf16:
27334 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27335 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27336 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27337 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27338 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27339 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27340 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27341 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27343 ; GFX8-LABEL: v_fcmp_uge_bf16:
27345 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27346 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27347 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27348 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27349 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27350 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27352 ; GFX9-LABEL: v_fcmp_uge_bf16:
27354 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27355 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27356 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27357 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27358 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27359 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27361 ; GFX10-LABEL: v_fcmp_uge_bf16:
27363 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27364 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27365 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27366 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27367 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27368 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27370 ; GFX11-LABEL: v_fcmp_uge_bf16:
27372 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27373 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27374 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27375 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27376 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27377 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27378 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27379 %op = fcmp uge bfloat %a, %b
; Test: `fcmp ult` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_nge_f32 (the negated "greater or
; equal" form) and materialize 0/1 with v_cndmask_b32. Check lines are
; autogenerated; regenerate them instead of editing by hand.
27383 define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
27384 ; GCN-LABEL: v_fcmp_ult_bf16:
27386 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27387 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27388 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27389 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27390 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27391 ; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27392 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27393 ; GCN-NEXT: s_setpc_b64 s[30:31]
27395 ; GFX7-LABEL: v_fcmp_ult_bf16:
27397 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27398 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27399 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27400 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27401 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27402 ; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27403 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27404 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27406 ; GFX8-LABEL: v_fcmp_ult_bf16:
27408 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27409 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27410 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27411 ; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27412 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27413 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27415 ; GFX9-LABEL: v_fcmp_ult_bf16:
27417 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27418 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27419 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27420 ; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27421 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27422 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27424 ; GFX10-LABEL: v_fcmp_ult_bf16:
27426 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27427 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27428 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27429 ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
27430 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27431 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27433 ; GFX11-LABEL: v_fcmp_ult_bf16:
27435 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27436 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27437 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27438 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27439 ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
27440 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27441 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27442 %op = fcmp ult bfloat %a, %b
; Test: `fcmp ule` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_ngt_f32 (the negated "greater than"
; form) and materialize 0/1 with v_cndmask_b32. Check lines are
; autogenerated; regenerate them instead of editing by hand.
27446 define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
27447 ; GCN-LABEL: v_fcmp_ule_bf16:
27449 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27450 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27451 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27452 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27453 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27454 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27455 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27456 ; GCN-NEXT: s_setpc_b64 s[30:31]
27458 ; GFX7-LABEL: v_fcmp_ule_bf16:
27460 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27461 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27462 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27463 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27464 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27465 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27466 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27467 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27469 ; GFX8-LABEL: v_fcmp_ule_bf16:
27471 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27472 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27473 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27474 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27475 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27476 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27478 ; GFX9-LABEL: v_fcmp_ule_bf16:
27480 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27481 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27482 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27483 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27484 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27485 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27487 ; GFX10-LABEL: v_fcmp_ule_bf16:
27489 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27490 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27491 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27492 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27493 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27494 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27496 ; GFX11-LABEL: v_fcmp_ule_bf16:
27498 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27499 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27500 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27501 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27502 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27503 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27504 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27505 %op = fcmp ule bfloat %a, %b
; Test: `fcmp une` on two bfloat arguments (v0, v1), producing an i1.
; The GCN and GFX7 expectations multiply each operand by 1.0 and mask it to
; its upper 16 bits; GFX8 and newer instead shift each operand left by 16.
; All targets then compare with v_cmp_neq_f32 and materialize 0/1 with
; v_cndmask_b32. Check lines are autogenerated; regenerate them instead of
; editing by hand.
27509 define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
27510 ; GCN-LABEL: v_fcmp_une_bf16:
27512 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27513 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27514 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27515 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27516 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27517 ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27518 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27519 ; GCN-NEXT: s_setpc_b64 s[30:31]
27521 ; GFX7-LABEL: v_fcmp_une_bf16:
27523 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27524 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27525 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27526 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27527 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27528 ; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27529 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27530 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27532 ; GFX8-LABEL: v_fcmp_une_bf16:
27534 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27535 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27536 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27537 ; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27538 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27539 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27541 ; GFX9-LABEL: v_fcmp_une_bf16:
27543 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27544 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27545 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27546 ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27547 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27548 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27550 ; GFX10-LABEL: v_fcmp_une_bf16:
27552 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27553 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27554 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27555 ; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
27556 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27557 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27559 ; GFX11-LABEL: v_fcmp_une_bf16:
27561 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27562 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27563 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27564 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27565 ; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
27566 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27567 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27568 %op = fcmp une bfloat %a, %b
; Test: `fcmp true` on two bfloat arguments, producing an i1.
; The predicate ignores its operands, so every target is expected to emit
; only `v_mov_b32 v0, 1` with no conversion or compare instructions.
; Check lines are autogenerated; regenerate them instead of editing by hand.
27572 define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
27573 ; GCN-LABEL: v_fcmp_true_bf16:
27575 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27576 ; GCN-NEXT: v_mov_b32_e32 v0, 1
27577 ; GCN-NEXT: s_setpc_b64 s[30:31]
27579 ; GFX7-LABEL: v_fcmp_true_bf16:
27581 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27582 ; GFX7-NEXT: v_mov_b32_e32 v0, 1
27583 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27585 ; GFX8-LABEL: v_fcmp_true_bf16:
27587 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27588 ; GFX8-NEXT: v_mov_b32_e32 v0, 1
27589 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27591 ; GFX9-LABEL: v_fcmp_true_bf16:
27593 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27594 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
27595 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27597 ; GFX10-LABEL: v_fcmp_true_bf16:
27599 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27600 ; GFX10-NEXT: v_mov_b32_e32 v0, 1
27601 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27603 ; GFX11-LABEL: v_fcmp_true_bf16:
27605 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27606 ; GFX11-NEXT: v_mov_b32_e32 v0, 1
27607 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27608 %op = fcmp true bfloat %a, %b
27612 declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
; Test: llvm.copysign.bf16 with both magnitude (v0) and sign (v1) passed as
; ordinary bfloat arguments.
; The GCN and GFX7 expectations isolate the sign with an AND of 0x80000000
; shifted right by 16, extract 15 magnitude bits with v_bfe_u32, OR the two
; together and shift the result left by 16. GFX8 and newer use a single
; v_bfi_b32 with the 0x7fff mask (held in s4 on GFX8/GFX9, an inline literal
; on GFX10/GFX11). Check lines are autogenerated; do not edit them by hand.
27614 define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
27615 ; GCN-LABEL: v_copysign_bf16_bf16:
27617 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27618 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27619 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27620 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27621 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27622 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27623 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27624 ; GCN-NEXT: s_setpc_b64 s[30:31]
27626 ; GFX7-LABEL: v_copysign_bf16_bf16:
27628 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27629 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27630 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27631 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27632 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27633 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27634 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27635 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27637 ; GFX8-LABEL: v_copysign_bf16_bf16:
27639 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27640 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27641 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27642 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27644 ; GFX9-LABEL: v_copysign_bf16_bf16:
27646 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27647 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27648 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27649 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27651 ; GFX10-LABEL: v_copysign_bf16_bf16:
27653 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27654 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27655 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27657 ; GFX11-LABEL: v_copysign_bf16_bf16:
27659 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27660 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27661 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27662 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 where the magnitude is an ordinary argument (v0)
; and the sign is marked `inreg`, so it arrives in a scalar register.
; The expectations read the sign from s16 on every target up to GFX10 and
; from s0 on GFX11. GCN/GFX7 mask and shift the sign with scalar ALU ops
; before OR-ing it into the extracted magnitude; GFX8/GFX9 first copy the
; sign to v1 for v_bfi_b32, while GFX10/GFX11 feed the SGPR to v_bfi_b32
; directly. Check lines are autogenerated; do not edit them by hand.
27666 define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
27667 ; GCN-LABEL: v_copysign_bf16_s_bf16:
27669 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27670 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27671 ; GCN-NEXT: s_and_b32 s4, s16, 0x80000000
27672 ; GCN-NEXT: s_lshr_b32 s4, s4, 16
27673 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27674 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0
27675 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27676 ; GCN-NEXT: s_setpc_b64 s[30:31]
27678 ; GFX7-LABEL: v_copysign_bf16_s_bf16:
27680 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27681 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27682 ; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000
27683 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16
27684 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27685 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0
27686 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27687 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27689 ; GFX8-LABEL: v_copysign_bf16_s_bf16:
27691 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27692 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27693 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
27694 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27695 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27697 ; GFX9-LABEL: v_copysign_bf16_s_bf16:
27699 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27700 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27701 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
27702 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27703 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27705 ; GFX10-LABEL: v_copysign_bf16_s_bf16:
27707 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27708 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16
27709 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27711 ; GFX11-LABEL: v_copysign_bf16_s_bf16:
27713 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27714 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
27715 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27716 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 where the magnitude is marked `inreg` (scalar
; register) and the sign is an ordinary argument (v0).
; The expectations read the magnitude from s16 on every target up to GFX10
; and from s0 on GFX11. GCN/GFX7 multiply the SGPR magnitude by 1.0 into v1
; before the extract/OR/shift sequence; GFX8/GFX9 copy it to v1 for
; v_bfi_b32, while GFX10/GFX11 pass the SGPR to v_bfi_b32 directly.
; Check lines are autogenerated; do not edit them by hand.
27720 define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
27721 ; GCN-LABEL: v_copysign_s_bf16_bf16:
27723 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27724 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16
27725 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
27726 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
27727 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
27728 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
27729 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27730 ; GCN-NEXT: s_setpc_b64 s[30:31]
27732 ; GFX7-LABEL: v_copysign_s_bf16_bf16:
27734 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27735 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16
27736 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
27737 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
27738 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
27739 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
27740 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27741 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27743 ; GFX8-LABEL: v_copysign_s_bf16_bf16:
27745 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27746 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27747 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
27748 ; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
27749 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27751 ; GFX9-LABEL: v_copysign_s_bf16_bf16:
27753 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27754 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27755 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
27756 ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
27757 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27759 ; GFX10-LABEL: v_copysign_s_bf16_bf16:
27761 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27762 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0
27763 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27765 ; GFX11-LABEL: v_copysign_s_bf16_bf16:
27767 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27768 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27769 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27770 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 whose sign operand is produced by an `fptrunc`
; from a float argument (v1).
; None of the expected sequences contain a rounding conversion: GCN/GFX7
; take the sign bit from the f32 with an AND of 0x80000000, and GFX8 and
; newer shift the f32 right by 16 before v_bfi_b32 with the 0x7fff mask.
; Check lines are autogenerated; do not edit them by hand.
27774 define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
27775 ; GCN-LABEL: v_copysign_bf16_f32:
27777 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27778 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27779 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27780 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27781 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27782 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27783 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27784 ; GCN-NEXT: s_setpc_b64 s[30:31]
27786 ; GFX7-LABEL: v_copysign_bf16_f32:
27788 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27789 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27790 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27791 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27792 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27793 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27794 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27795 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27797 ; GFX8-LABEL: v_copysign_bf16_f32:
27799 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27800 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27801 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27802 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27803 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27805 ; GFX9-LABEL: v_copysign_bf16_f32:
27807 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27808 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27809 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27810 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27811 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27813 ; GFX10-LABEL: v_copysign_bf16_f32:
27815 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27816 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27817 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27818 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27820 ; GFX11-LABEL: v_copysign_bf16_f32:
27822 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27823 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27824 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27825 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27826 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27827 %sign = fptrunc float %sign.f32 to bfloat
27828 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 whose sign operand is produced by an `fptrunc`
; from a double argument.
; None of the expected sequences contain a conversion instruction: every
; target reads only v2 from the double, masking it with 0x80000000 on
; GCN/GFX7 or shifting it right by 16 before v_bfi_b32 on GFX8 and newer.
; Check lines are autogenerated; do not edit them by hand.
27832 define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
27833 ; GCN-LABEL: v_copysign_bf16_f64:
27835 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27836 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27837 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2
27838 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27839 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27840 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27841 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27842 ; GCN-NEXT: s_setpc_b64 s[30:31]
27844 ; GFX7-LABEL: v_copysign_bf16_f64:
27846 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27847 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27848 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2
27849 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27850 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27851 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27852 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27853 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27855 ; GFX8-LABEL: v_copysign_bf16_f64:
27857 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27858 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27859 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27860 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27861 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27863 ; GFX9-LABEL: v_copysign_bf16_f64:
27865 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27866 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27867 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27868 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27869 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27871 ; GFX10-LABEL: v_copysign_bf16_f64:
27873 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27874 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27875 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27876 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27878 ; GFX11-LABEL: v_copysign_bf16_f64:
27880 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27881 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27882 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27883 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27884 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27885 %sign = fptrunc double %sign.f64 to bfloat
27886 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 whose sign operand is a half argument
; reinterpreted with `bitcast` (not a floating-point conversion).
; The GCN and GFX7 expectations convert v1 with v_cvt_f16_f32 and keep only
; bit 15 (mask 0x8000) before OR-ing it into the extracted magnitude; note
; that the two targets schedule those instructions in a different order.
; GFX8 and newer apply v_bfi_b32 with the 0x7fff mask to v0 and v1 directly.
; Check lines are autogenerated; do not edit them by hand.
27890 define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
27891 ; GCN-LABEL: v_copysign_bf16_f16:
27893 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27894 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27895 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
27896 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
27897 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27898 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27899 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27900 ; GCN-NEXT: s_setpc_b64 s[30:31]
27902 ; GFX7-LABEL: v_copysign_bf16_f16:
27904 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27905 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
27906 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27907 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27908 ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1
27909 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27910 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27911 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27913 ; GFX8-LABEL: v_copysign_bf16_f16:
27915 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27916 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27917 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27918 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27920 ; GFX9-LABEL: v_copysign_bf16_f16:
27922 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27923 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27924 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27925 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27927 ; GFX10-LABEL: v_copysign_bf16_f16:
27929 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27930 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27931 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27933 ; GFX11-LABEL: v_copysign_bf16_f16:
27935 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27936 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27937 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27938 %sign = bitcast half %sign.f16 to bfloat
27939 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; Test: llvm.copysign.bf16 in an amdgpu_ps function with both operands
; marked `inreg` (magnitude in s0, sign in s1 per the expectations).
; The copysign result is bitcast to i16, zero-extended to i32 and passed to
; llvm.amdgcn.readfirstlane, so every expected sequence ends with
; v_readfirstlane_b32 into s0. GCN/GFX7 assemble the bits with the
; extract/OR sequence; GFX8 and newer use v_bfi_b32 followed by an AND with
; 0xffff. Check lines are autogenerated; do not edit them by hand.
27943 define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
27944 ; GCN-LABEL: s_copysign_bf16_bf16:
27946 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
27947 ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
27948 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
27949 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27950 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
27951 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
27952 ; GCN-NEXT: ; return to shader part epilog
27954 ; GFX7-LABEL: s_copysign_bf16_bf16:
27956 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
27957 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
27958 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
27959 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27960 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
27961 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
27962 ; GFX7-NEXT: ; return to shader part epilog
27964 ; GFX8-LABEL: s_copysign_bf16_bf16:
27966 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
27967 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
27968 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
27969 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
27970 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
27971 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
27972 ; GFX8-NEXT: ; return to shader part epilog
27974 ; GFX9-LABEL: s_copysign_bf16_bf16:
27976 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
27977 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
27978 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
27979 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
27980 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
27981 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
27982 ; GFX9-NEXT: ; return to shader part epilog
27984 ; GFX10-LABEL: s_copysign_bf16_bf16:
27986 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
27987 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27988 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
27989 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
27990 ; GFX10-NEXT: ; return to shader part epilog
27992 ; GFX11-LABEL: s_copysign_bf16_bf16:
27994 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
27995 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
27996 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27997 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
27998 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27999 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28000 ; GFX11-NEXT: ; return to shader part epilog
28001 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28002 %cast = bitcast bfloat %op to i16
28003 %zext = zext i16 %cast to i32
28004 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Test: llvm.copysign.bf16 in an amdgpu_ps function with an `inreg` bfloat
; magnitude (s0) and a sign obtained by `fptrunc` of an `inreg` float (s1).
; The copysign result is bitcast to i16, zero-extended to i32 and passed to
; llvm.amdgcn.readfirstlane. GCN/GFX7 mask the sign bit of s1 with scalar
; ALU ops; GFX8 and newer shift s1 right by 16 into v0, apply v_bfi_b32 with
; the 0x7fff mask and then AND with 0xffff. No target is expected to emit a
; rounding conversion. Check lines are autogenerated; do not edit by hand.
28008 define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
28009 ; GCN-LABEL: s_copysign_bf16_f32:
28011 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28012 ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
28013 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
28014 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28015 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
28016 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28017 ; GCN-NEXT: ; return to shader part epilog
28019 ; GFX7-LABEL: s_copysign_bf16_f32:
28021 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
28022 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
28023 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
28024 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
28025 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
28026 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28027 ; GFX7-NEXT: ; return to shader part epilog
28029 ; GFX8-LABEL: s_copysign_bf16_f32:
28031 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28032 ; GFX8-NEXT: s_movk_i32 s1, 0x7fff
28033 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28034 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28035 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28036 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28037 ; GFX8-NEXT: ; return to shader part epilog
28039 ; GFX9-LABEL: s_copysign_bf16_f32:
28041 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28042 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff
28043 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28044 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28045 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28046 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28047 ; GFX9-NEXT: ; return to shader part epilog
28049 ; GFX10-LABEL: s_copysign_bf16_f32:
28051 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28052 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28053 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28054 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28055 ; GFX10-NEXT: ; return to shader part epilog
28057 ; GFX11-LABEL: s_copysign_bf16_f32:
28059 ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28060 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28061 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28062 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28063 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28064 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28065 ; GFX11-NEXT: ; return to shader part epilog
28066 %sign = fptrunc float %sign.f32 to bfloat
28067 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28068 %cast = bitcast bfloat %op to i16
28069 %zext = zext i16 %cast to i32
28070 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Uniform (inreg) copysign with a bfloat magnitude and a double sign source.
; The sign operand is fptrunc'd from double to bfloat before the call to
; llvm.copysign.bf16. In every check block below the only sign-related input
; register is s2 (presumably the dword of %sign.f64 that holds its sign bit,
; TODO confirm against the calling convention), and no conversion instruction
; is expected for the fptrunc. The GCN and GFX7 blocks mask s2 with 0x80000000
; and shift it right by 16, while the GFX8 and later blocks shift s2 right by
; 16 and merge with v_bfi_b32 using the mask 0x7fff.
28074 define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
28075 ; GCN-LABEL: s_copysign_bf16_f64:
28077 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28078 ; GCN-NEXT: s_and_b32 s0, s2, 0x80000000
28079 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
28080 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28081 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
28082 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28083 ; GCN-NEXT: ; return to shader part epilog
28085 ; GFX7-LABEL: s_copysign_bf16_f64:
28087 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
28088 ; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000
28089 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
28090 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
28091 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
28092 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28093 ; GFX7-NEXT: ; return to shader part epilog
28095 ; GFX8-LABEL: s_copysign_bf16_f64:
28097 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28098 ; GFX8-NEXT: s_movk_i32 s1, 0x7fff
28099 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28100 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28101 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28102 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28103 ; GFX8-NEXT: ; return to shader part epilog
28105 ; GFX9-LABEL: s_copysign_bf16_f64:
28107 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28108 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff
28109 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28110 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28111 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28112 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28113 ; GFX9-NEXT: ; return to shader part epilog
28115 ; GFX10-LABEL: s_copysign_bf16_f64:
28117 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28118 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28119 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28120 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28121 ; GFX10-NEXT: ; return to shader part epilog
28123 ; GFX11-LABEL: s_copysign_bf16_f64:
28125 ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28127 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28128 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28130 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28131 ; GFX11-NEXT: ; return to shader part epilog
; IR under test. The bf16 result is reinterpreted as i16 and zero-extended to
; i32 so that it can be passed through llvm.amdgcn.readfirstlane.
28132 %sign = fptrunc double %sign.f64 to bfloat
28133 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28134 %cast = bitcast bfloat %op to i16
28135 %zext = zext i16 %cast to i32
28136 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Uniform (inreg) copysign with a bfloat magnitude and a half sign source.
; The sign operand is a bitcast (bit reinterpretation, not a value conversion)
; from half to bfloat, so only its top bit feeds llvm.copysign.bf16.
; The GCN and GFX7 check blocks contain a v_cvt_f16_f32 of s1 followed by a
; mask with 0x8000, whereas the GFX8 and later blocks use s1 directly as the
; sign input of a v_bfi_b32 with the mask 0x7fff.
28140 define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
28141 ; GCN-LABEL: s_copysign_bf16_f16:
28143 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28144 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1
28145 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
28146 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28147 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28148 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28149 ; GCN-NEXT: ; return to shader part epilog
28151 ; GFX7-LABEL: s_copysign_bf16_f16:
28153 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1
28154 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
28155 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
28156 ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
28157 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
28158 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28159 ; GFX7-NEXT: ; return to shader part epilog
28161 ; GFX8-LABEL: s_copysign_bf16_f16:
28163 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
28164 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
28165 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28166 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
28167 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28168 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28169 ; GFX8-NEXT: ; return to shader part epilog
28171 ; GFX9-LABEL: s_copysign_bf16_f16:
28173 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
28174 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
28175 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28176 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
28177 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28178 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28179 ; GFX9-NEXT: ; return to shader part epilog
28181 ; GFX10-LABEL: s_copysign_bf16_f16:
28183 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
28184 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28185 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28186 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28187 ; GFX10-NEXT: ; return to shader part epilog
28189 ; GFX11-LABEL: s_copysign_bf16_f16:
28191 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
28192 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28193 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28194 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28195 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28196 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28197 ; GFX11-NEXT: ; return to shader part epilog
; IR under test. The bf16 result is reinterpreted as i16 and zero-extended to
; i32 so that it can be passed through llvm.amdgcn.readfirstlane.
28198 %sign = bitcast half %sign.f16 to bfloat
28199 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28200 %cast = bitcast bfloat %op to i16
28201 %zext = zext i16 %cast to i32
28202 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28206 declare float @llvm.copysign.f32(float, float)
; Per-lane (VGPR) copysign with a float magnitude and a bfloat sign source.
; The sign operand is fpext'd from bfloat to float before llvm.copysign.f32.
; Every check block expects a single v_bfi_b32 merging v0 (magnitude) with v1
; (sign). The GFX8 and later blocks first shift v1 left by 16, while the GCN
; and GFX7 blocks use v1 unchanged. The GCN through GFX9 blocks build the mask
; in s4 with s_brev_b32 of -2, presumably the same 0x7fffffff value that the
; GFX10 and GFX11 blocks pass as a literal (TODO confirm).
28208 define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
28209 ; GCN-LABEL: v_copysign_f32_bf16:
28211 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28212 ; GCN-NEXT: s_brev_b32 s4, -2
28213 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
28214 ; GCN-NEXT: s_setpc_b64 s[30:31]
28216 ; GFX7-LABEL: v_copysign_f32_bf16:
28218 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28219 ; GFX7-NEXT: s_brev_b32 s4, -2
28220 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
28221 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28223 ; GFX8-LABEL: v_copysign_f32_bf16:
28225 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28226 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28227 ; GFX8-NEXT: s_brev_b32 s4, -2
28228 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
28229 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28231 ; GFX9-LABEL: v_copysign_f32_bf16:
28233 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28234 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28235 ; GFX9-NEXT: s_brev_b32 s4, -2
28236 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
28237 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28239 ; GFX10-LABEL: v_copysign_f32_bf16:
28241 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28242 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28243 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
28244 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28246 ; GFX11-LABEL: v_copysign_f32_bf16:
28248 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28249 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28250 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28251 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
28252 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The sign is widened to the magnitude's type, then copysign.
28253 %sign = fpext bfloat %sign.bf16 to float
28254 %op = call float @llvm.copysign.f32(float %mag, float %sign)
; Uniform (inreg) variant of the float-magnitude, bfloat-sign copysign test.
; The sign operand is fpext'd from bfloat to float, the f32 result is bitcast
; to i32 and passed through llvm.amdgcn.readfirstlane.
; Every check block still expects the merge to be done with the vector
; instruction v_bfi_b32 followed by v_readfirstlane_b32 into s0. The GFX8 and
; later blocks shift s1 left by 16 first, the GCN and GFX7 blocks do not.
28258 define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
28259 ; GCN-LABEL: s_copysign_f32_bf16:
28261 ; GCN-NEXT: s_brev_b32 s2, -2
28262 ; GCN-NEXT: v_mov_b32_e32 v0, s0
28263 ; GCN-NEXT: v_mov_b32_e32 v1, s1
28264 ; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1
28265 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28266 ; GCN-NEXT: ; return to shader part epilog
28268 ; GFX7-LABEL: s_copysign_f32_bf16:
28270 ; GFX7-NEXT: s_brev_b32 s2, -2
28271 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
28272 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
28273 ; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1
28274 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28275 ; GFX7-NEXT: ; return to shader part epilog
28277 ; GFX8-LABEL: s_copysign_f32_bf16:
28279 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28280 ; GFX8-NEXT: s_brev_b32 s1, -2
28281 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28282 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28283 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28284 ; GFX8-NEXT: ; return to shader part epilog
28286 ; GFX9-LABEL: s_copysign_f32_bf16:
28288 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28289 ; GFX9-NEXT: s_brev_b32 s1, -2
28290 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28291 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28292 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28293 ; GFX9-NEXT: ; return to shader part epilog
28295 ; GFX10-LABEL: s_copysign_f32_bf16:
28297 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28298 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
28299 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28300 ; GFX10-NEXT: ; return to shader part epilog
28302 ; GFX11-LABEL: s_copysign_f32_bf16:
28304 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28306 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
28307 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28308 ; GFX11-NEXT: ; return to shader part epilog
; IR under test. The f32 result is reinterpreted as i32 for readfirstlane.
28309 %sign = fpext bfloat %sign.bf16 to float
28310 %op = call float @llvm.copysign.f32(float %mag, float %sign)
28311 %cast = bitcast float %op to i32
28312 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
28316 declare half @llvm.copysign.f16(half, half)
; Per-lane (VGPR) copysign with a half magnitude and a bfloat sign source.
; The sign operand is a bitcast (bit reinterpretation) from bfloat to half, so
; both operands of llvm.copysign.f16 are 16 bits wide.
; The GCN and GFX7 check blocks go through v_cvt_f16_f32 and v_cvt_f32_f16
; conversions and use a mask built with s_brev_b32, while the GFX8 and later
; blocks expect a single v_bfi_b32 with the 16-bit mask 0x7fff.
28318 define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
28319 ; GCN-LABEL: v_copysign_f16_bf16:
28321 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28322 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28323 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
28324 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28325 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
28326 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
28327 ; GCN-NEXT: s_brev_b32 s4, -2
28328 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
28329 ; GCN-NEXT: s_setpc_b64 s[30:31]
28331 ; GFX7-LABEL: v_copysign_f16_bf16:
28333 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28334 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
28335 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28336 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28337 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
28338 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
28339 ; GFX7-NEXT: s_brev_b32 s4, -2
28340 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
28341 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28343 ; GFX8-LABEL: v_copysign_f16_bf16:
28345 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28346 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
28347 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
28348 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28350 ; GFX9-LABEL: v_copysign_f16_bf16:
28352 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28353 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
28354 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
28355 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28357 ; GFX10-LABEL: v_copysign_f16_bf16:
28359 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28360 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
28361 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28363 ; GFX11-LABEL: v_copysign_f16_bf16:
28365 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28366 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
28367 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The sign bits are reinterpreted as half, then copysign.
28368 %sign = bitcast bfloat %sign.bf16 to half
28369 %op = call half @llvm.copysign.f16(half %mag, half %sign)
; Uniform (inreg) variant of the half-magnitude, bfloat-sign copysign test.
; The sign operand is a bitcast from bfloat to half. The f16 result is bitcast
; to i16, zero-extended to i32 and passed through llvm.amdgcn.readfirstlane.
; The GCN and GFX7 check blocks convert through f32 (v_cvt_f16_f32 and
; v_cvt_f32_f16) around a v_bfi_b32 with a mask built by s_brev_b32. The GFX8
; and later blocks expect a v_bfi_b32 with mask 0x7fff followed by a mask of
; the result with 0xffff.
28373 define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
28374 ; GCN-LABEL: s_copysign_f16_bf16:
28376 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
28377 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0
28378 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
28379 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
28380 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
28381 ; GCN-NEXT: s_brev_b32 s0, -2
28382 ; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0
28383 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
28384 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28385 ; GCN-NEXT: ; return to shader part epilog
28387 ; GFX7-LABEL: s_copysign_f16_bf16:
28389 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0
28390 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
28391 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28392 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
28393 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
28394 ; GFX7-NEXT: s_brev_b32 s0, -2
28395 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
28396 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
28397 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28398 ; GFX7-NEXT: ; return to shader part epilog
28400 ; GFX8-LABEL: s_copysign_f16_bf16:
28402 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
28403 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
28404 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28405 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
28406 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28407 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28408 ; GFX8-NEXT: ; return to shader part epilog
28410 ; GFX9-LABEL: s_copysign_f16_bf16:
28412 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
28413 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
28414 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28415 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
28416 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28417 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28418 ; GFX9-NEXT: ; return to shader part epilog
28420 ; GFX10-LABEL: s_copysign_f16_bf16:
28422 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
28423 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28424 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28425 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28426 ; GFX10-NEXT: ; return to shader part epilog
28428 ; GFX11-LABEL: s_copysign_f16_bf16:
28430 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
28431 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28432 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28433 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28434 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28435 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28436 ; GFX11-NEXT: ; return to shader part epilog
; IR under test. The f16 result is reinterpreted as i16 and zero-extended to
; i32 so that it can be passed through llvm.amdgcn.readfirstlane.
28437 %sign = bitcast bfloat %sign.bf16 to half
28438 %op = call half @llvm.copysign.f16(half %mag, half %sign)
28439 %cast = bitcast half %op to i16
28440 %zext = zext i16 %cast to i32
28441 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28445 declare double @llvm.copysign.f64(double, double)
; Per-lane (VGPR) copysign with a double magnitude and a bfloat sign source.
; The sign operand is fpext'd from bfloat to double before llvm.copysign.f64.
; Every check block expects one v_bfi_b32 that rewrites only v1 from v1 and v2.
; v0 is never written, so presumably it holds the untouched low half of the
; magnitude (TODO confirm against the calling convention). The GFX8 and later
; blocks shift v2 left by 16 first, the GCN and GFX7 blocks use v2 unchanged.
28447 define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
28448 ; GCN-LABEL: v_copysign_f64_bf16:
28450 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28451 ; GCN-NEXT: s_brev_b32 s4, -2
28452 ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2
28453 ; GCN-NEXT: s_setpc_b64 s[30:31]
28455 ; GFX7-LABEL: v_copysign_f64_bf16:
28457 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28458 ; GFX7-NEXT: s_brev_b32 s4, -2
28459 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
28460 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28462 ; GFX8-LABEL: v_copysign_f64_bf16:
28464 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28465 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28466 ; GFX8-NEXT: s_brev_b32 s4, -2
28467 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
28468 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28470 ; GFX9-LABEL: v_copysign_f64_bf16:
28472 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28473 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28474 ; GFX9-NEXT: s_brev_b32 s4, -2
28475 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
28476 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28478 ; GFX10-LABEL: v_copysign_f64_bf16:
28480 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28481 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28482 ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
28483 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28485 ; GFX11-LABEL: v_copysign_f64_bf16:
28487 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28488 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28489 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28490 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
28491 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The sign is widened to the magnitude's type, then copysign.
28492 %sign = fpext bfloat %sign.bf16 to double
28493 %op = call double @llvm.copysign.f64(double %mag, double %sign)
; Uniform (inreg) variant of the double-magnitude, bfloat-sign copysign test.
; The sign operand is fpext'd from bfloat to double. The f64 result is bitcast
; to <2 x i32>, each element is passed through llvm.amdgcn.readfirstlane, and
; the two values are reassembled into the returned <2 x i32>.
; Every check block expects only one v_readfirstlane_b32, writing s1, after a
; v_bfi_b32 of s1 and s2. No instruction writes s0, so presumably the low half
; of the magnitude is returned as it arrived (TODO confirm).
28497 define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
28498 ; GCN-LABEL: s_copysign_f64_bf16:
28500 ; GCN-NEXT: s_brev_b32 s3, -2
28501 ; GCN-NEXT: v_mov_b32_e32 v0, s1
28502 ; GCN-NEXT: v_mov_b32_e32 v1, s2
28503 ; GCN-NEXT: v_bfi_b32 v0, s3, v0, v1
28504 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
28505 ; GCN-NEXT: ; return to shader part epilog
28507 ; GFX7-LABEL: s_copysign_f64_bf16:
28509 ; GFX7-NEXT: s_brev_b32 s3, -2
28510 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
28511 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
28512 ; GFX7-NEXT: v_bfi_b32 v0, s3, v0, v1
28513 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
28514 ; GFX7-NEXT: ; return to shader part epilog
28516 ; GFX8-LABEL: s_copysign_f64_bf16:
28518 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28519 ; GFX8-NEXT: s_brev_b32 s2, -2
28520 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28521 ; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0
28522 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0
28523 ; GFX8-NEXT: ; return to shader part epilog
28525 ; GFX9-LABEL: s_copysign_f64_bf16:
28527 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28528 ; GFX9-NEXT: s_brev_b32 s2, -2
28529 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28530 ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
28531 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
28532 ; GFX9-NEXT: ; return to shader part epilog
28534 ; GFX10-LABEL: s_copysign_f64_bf16:
28536 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28537 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
28538 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0
28539 ; GFX10-NEXT: ; return to shader part epilog
28541 ; GFX11-LABEL: s_copysign_f64_bf16:
28543 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28544 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28545 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
28546 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
28547 ; GFX11-NEXT: ; return to shader part epilog
; IR under test. The 64-bit result is split into two i32 halves because
; readfirstlane is called on i32 values here, then rebuilt as a vector.
28548 %sign = fpext bfloat %sign.bf16 to double
28549 %op = call double @llvm.copysign.f64(double %mag, double %sign)
28550 %cast = bitcast double %op to <2 x i32>
28551 %cast.0 = extractelement <2 x i32> %cast, i32 0
28552 %cast.1 = extractelement <2 x i32> %cast, i32 1
28553 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
28554 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
28555 %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
28556 %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
28557 ret <2 x i32> %ins.1
; Scalar-typed fptosi from bfloat to i16.
; Every check block expects the bf16 value to be positioned in the upper 16
; bits of a 32-bit register and then converted with v_cvt_i32_f32, so the
; conversion is done on a 32-bit float. The GCN and GFX7 blocks use a
; v_mul_f32 by 1.0 plus a mask with 0xffff0000, while the GFX8 and later blocks
; use a left shift by 16.
28560 define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
28561 ; GCN-LABEL: v_fptosi_bf16_to_i16:
28563 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28564 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28565 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28566 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28567 ; GCN-NEXT: s_setpc_b64 s[30:31]
28569 ; GFX7-LABEL: v_fptosi_bf16_to_i16:
28571 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28572 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28573 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28574 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28575 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28577 ; GFX8-LABEL: v_fptosi_bf16_to_i16:
28579 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28580 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28581 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28582 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28584 ; GFX9-LABEL: v_fptosi_bf16_to_i16:
28586 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28587 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28588 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28589 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28591 ; GFX10-LABEL: v_fptosi_bf16_to_i16:
28593 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28594 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28595 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28596 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28598 ; GFX11-LABEL: v_fptosi_bf16_to_i16:
28600 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28601 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28602 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28603 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
28604 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test.
28605 %op = fptosi bfloat %x to i16
; Two-element vector fptosi from <2 x bfloat> to <2 x i16>.
; Each check block expects one v_cvt_i32_f32 per element and then a repack of
; the two 16-bit results. The GCN and GFX7 blocks take the elements in v0 and
; v1 and repack with shift, mask and or. The GFX8 block takes both elements
; from v0 and repacks with SDWA forms. The GFX9 and later blocks repack with
; v_perm_b32 and the selector 0x5040100.
; GFX11 is split into separate true16 and fake16 check blocks here. In the
; lines below the two differ only in which registers hold the low and high
; element and therefore in the v_perm_b32 operand order.
28609 define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
28610 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
28612 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28613 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28614 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28615 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28616 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28617 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28618 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28619 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28620 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28621 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2
28622 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
28623 ; GCN-NEXT: s_setpc_b64 s[30:31]
28625 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
28627 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28628 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28629 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28630 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28631 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28632 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28633 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28634 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28635 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28636 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
28637 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
28638 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28640 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
28642 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28643 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
28644 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28645 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28646 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28647 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28648 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28650 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
28652 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28653 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28654 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28655 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28656 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28657 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28658 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
28659 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28661 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
28663 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28664 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28665 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28666 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28667 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28668 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
28669 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28671 ; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28672 ; GFX11TRUE16: ; %bb.0:
28673 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28674 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
28675 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28676 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28677 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28678 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28679 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
28680 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
28681 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28683 ; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28684 ; GFX11FAKE16: ; %bb.0:
28685 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28686 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28687 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28688 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28689 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28690 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28691 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
28692 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
28693 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test.
28694 %op = fptosi <2 x bfloat> %x to <2 x i16>
; Three-element vector fptosi from <3 x bfloat> to <3 x i16>.
; Each check block expects three v_cvt_i32_f32 conversions, one per element.
; The GCN and GFX7 blocks take the elements in v0, v1 and v2 and leave results
; in v0, v1 and v2 using shift, mask, or and v_alignbit_b32. The GFX8 and later
; blocks read the first two elements from the halves of v0 and the third from
; v1. They repack the first two results into v0 (SDWA on GFX8, v_perm_b32 with
; selector 0x5040100 from GFX9 on) and leave the third result in v1.
; GFX11 is split into separate true16 and fake16 check blocks, which in the
; lines below differ only in register assignment and v_perm_b32 operand order.
28698 define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
28699 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
28701 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28702 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
28703 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28704 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28705 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28706 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28707 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28708 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28709 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28710 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2
28711 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28712 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28713 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
28714 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28715 ; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16
28716 ; GCN-NEXT: s_setpc_b64 s[30:31]
28718 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16:
28720 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28721 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28722 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28723 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
28724 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28725 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28726 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28727 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28728 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28729 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2
28730 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28731 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28732 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
28733 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3
28734 ; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
28735 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28737 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
28739 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28740 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28741 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28742 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
28743 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28744 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28745 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
28746 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28747 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28749 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
28751 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28752 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28753 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28754 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
28755 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28756 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28757 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28758 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28759 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
28760 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28762 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
28764 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28765 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28766 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28767 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28768 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
28769 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28770 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28771 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
28772 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28774 ; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28775 ; GFX11TRUE16: ; %bb.0:
28776 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28777 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
28778 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28779 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28780 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28781 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28782 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28783 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28784 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28785 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
28786 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28788 ; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28789 ; GFX11FAKE16: ; %bb.0:
28790 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28791 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28792 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28793 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28794 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28795 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28796 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28797 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28798 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28799 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
28800 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test.
28801 %op = fptosi <3 x bfloat> %x to <3 x i16>
28805 define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
28806 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
28808 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28809 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
28810 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
28811 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28812 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28813 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28814 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28815 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
28816 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28817 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28818 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28819 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
28820 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
28821 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28822 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28823 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
28824 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
28825 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28826 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4
28827 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
28828 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
28829 ; GCN-NEXT: s_setpc_b64 s[30:31]
28831 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
28833 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28834 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
28835 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
28836 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28837 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28838 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
28839 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28840 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28841 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28842 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
28843 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
28844 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28845 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28846 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
28847 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
28848 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28849 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28850 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
28851 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
28852 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
28853 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
28854 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28856 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
28858 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28859 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28860 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28861 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28862 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28863 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
28864 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3
28865 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28866 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28867 ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28868 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28869 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28871 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
28873 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28874 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28875 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28876 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28877 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28878 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
28879 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
28880 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28881 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28882 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28883 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
28884 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
28885 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28887 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
28889 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28890 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28891 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28892 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28893 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28894 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
28895 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3
28896 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28897 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28898 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
28899 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
28900 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28902 ; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28903 ; GFX11TRUE16: ; %bb.0:
28904 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28905 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
28906 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28907 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
28908 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28909 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28910 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28911 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28912 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28913 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
28914 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28915 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28916 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
28917 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
28918 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28920 ; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28921 ; GFX11FAKE16: ; %bb.0:
28922 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28923 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28924 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28925 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28926 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28927 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28928 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28929 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
28930 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28931 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28932 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28933 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28934 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
28935 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
28936 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
28937 %op = fptosi <4 x bfloat> %x to <4 x i16>
; Test: scalar fptosi from bfloat to i32.
; In the GCN and GFX7 expectations the input in v0 is multiplied by 1.0 and
; masked with 0xffff0000 before v_cvt_i32_f32. In the GFX8, GFX9, GFX10 and
; GFX11 expectations v0 is shifted left by 16 and then converted with the same
; instruction.
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
28941 define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
28942 ; GCN-LABEL: v_fptosi_bf16_to_i32:
28944 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28945 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28946 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28947 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28948 ; GCN-NEXT: s_setpc_b64 s[30:31]
28950 ; GFX7-LABEL: v_fptosi_bf16_to_i32:
28952 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28953 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28954 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28955 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28956 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28958 ; GFX8-LABEL: v_fptosi_bf16_to_i32:
28960 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28961 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28962 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28963 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28965 ; GFX9-LABEL: v_fptosi_bf16_to_i32:
28967 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28968 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28969 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28970 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28972 ; GFX10-LABEL: v_fptosi_bf16_to_i32:
28974 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28975 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28976 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28977 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28979 ; GFX11-LABEL: v_fptosi_bf16_to_i32:
28981 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28982 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28984 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
28985 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28986 %op = fptosi bfloat %x to i32
; Test: fptosi from <2 x bfloat> to <2 x i32>.
; The GCN and GFX7 expectations operate on v0 and v1 separately: each is
; multiplied by 1.0, masked with 0xffff0000 and converted with v_cvt_i32_f32.
; The GFX8 through GFX11 expectations read both inputs from v0: the low 16
; bits (shifted left by 16) produce the result in v0, and the high 16 bits
; (masked with 0xffff0000) produce the result in v1.
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
28990 define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
28991 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
28993 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28994 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28995 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28996 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28997 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28998 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28999 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
29000 ; GCN-NEXT: s_setpc_b64 s[30:31]
29002 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32:
29004 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29005 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29006 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29007 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29008 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29009 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29010 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29011 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29013 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32:
29015 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29016 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29017 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v1
29018 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29019 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0
29020 ; GFX8-NEXT: v_mov_b32_e32 v0, v2
29021 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29023 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32:
29025 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29026 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29027 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v1
29028 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29029 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0
29030 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
29031 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29033 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
29035 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29036 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29037 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
29038 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v1
29039 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2
29040 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29042 ; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
29044 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29045 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29046 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
29047 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29048 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1
29049 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2
29050 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29051 %op = fptosi <2 x bfloat> %x to <2 x i32>
; Test: fptosi from <3 x bfloat> to <3 x i32>.
; The GCN and GFX7 expectations operate on v0, v1 and v2 separately: each is
; multiplied by 1.0, masked with 0xffff0000 and converted with v_cvt_i32_f32.
; The GFX8 through GFX11 expectations read the inputs from v0 and v1: the low
; and high 16 bits of v0 produce the results in v0 and v1, and the low 16 bits
; of v1 produce the result in v2. The high 16 bits of v1 are not read.
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
29055 define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
29056 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
29058 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29059 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29060 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29061 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29062 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29063 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29064 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29065 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
29066 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
29067 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
29068 ; GCN-NEXT: s_setpc_b64 s[30:31]
29070 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32:
29072 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29073 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29074 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29075 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29076 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29077 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29078 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29079 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29080 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29081 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
29082 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29084 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32:
29086 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29087 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29088 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29089 ; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
29090 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
29091 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29092 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
29093 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
29094 ; GFX8-NEXT: v_mov_b32_e32 v1, v3
29095 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29097 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
29099 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29100 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29101 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29102 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
29103 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
29104 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29105 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
29106 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
29107 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
29108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29110 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
29112 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29113 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29114 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29115 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29116 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
29117 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
29118 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
29119 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29121 ; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
29123 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29124 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29125 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29126 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29127 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29128 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
29129 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
29130 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
29131 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
29132 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29133 %op = fptosi <3 x bfloat> %x to <3 x i32>
; Test: fptosi from <4 x bfloat> to <4 x i32>.
; The GCN and GFX7 expectations operate on v0 through v3 separately: each is
; multiplied by 1.0, masked with 0xffff0000 and converted with v_cvt_i32_f32.
; The GFX8 through GFX11 expectations read the inputs from v0 and v1: the low
; and high 16 bits of v0 produce the results in v0 and v1, and the low and
; high 16 bits of v1 produce the results in v2 and v3.
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
29137 define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
29138 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
29140 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29141 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
29142 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29143 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29144 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29145 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29146 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29147 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29148 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29149 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
29150 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
29151 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
29152 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
29153 ; GCN-NEXT: s_setpc_b64 s[30:31]
29155 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32:
29157 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29158 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
29159 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29160 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29161 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29162 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29163 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29164 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29165 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29166 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29167 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29168 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
29169 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
29170 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29172 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32:
29174 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29175 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29176 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29177 ; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v0
29178 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29179 ; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
29180 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
29181 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
29182 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
29183 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
29184 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
29185 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29187 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
29189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29190 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29191 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29192 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v0
29193 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29194 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
29195 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
29196 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
29197 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
29198 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
29199 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
29200 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29202 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
29204 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29205 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29206 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29207 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29208 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
29209 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
29210 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
29211 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
29212 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v5
29213 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29215 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
29217 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29218 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29219 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29220 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29221 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
29222 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29223 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
29224 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
29225 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29226 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
29227 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5
29228 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29229 %op = fptosi <4 x bfloat> %x to <4 x i32>
; Test: scalar fptosi from bfloat to i64.
; Every target's expectation is an expanded sequence with no single 64-bit
; convert instruction:
;   1. widen the input (mul by 1.0 plus 0xffff0000 mask in the GCN and GFX7
;      checks, 16-bit left shift in the GFX8 through GFX11 checks), then
;      v_trunc_f32;
;   2. multiply |x| by 0x2f800000 (the f32 bit pattern of 2^-32) and
;      v_floor_f32 the product; this value becomes the high result word;
;   3. v_fma_f32 of that floor with 0xcf800000 (the f32 bit pattern of -2^32)
;      plus |x|; this value becomes the low result word;
;   4. v_cvt_u32_f32 on both values;
;   5. xor both words with the sign mask (x arithmetically shifted right by
;      31) and subtract the mask from the 64-bit pair, carrying through vcc.
; The GFX10 and GFX11 checks use the two constants as literal operands; the
; earlier targets first load them into SGPRs with s_mov_b32.
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
29233 define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
29234 ; GCN-LABEL: v_fptosi_bf16_to_i64:
29236 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29237 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29238 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29239 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29240 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29241 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29242 ; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4
29243 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
29244 ; GCN-NEXT: v_floor_f32_e32 v1, v1
29245 ; GCN-NEXT: v_fma_f32 v0, v1, s5, |v0|
29246 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29247 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29248 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
29249 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
29250 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
29251 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
29252 ; GCN-NEXT: s_setpc_b64 s[30:31]
29254 ; GFX7-LABEL: v_fptosi_bf16_to_i64:
29256 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29257 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29258 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29259 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29260 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29261 ; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4
29262 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29263 ; GFX7-NEXT: s_mov_b32 s4, 0xcf800000
29264 ; GFX7-NEXT: v_fma_f32 v2, v1, s4, |v0|
29265 ; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
29266 ; GFX7-NEXT: v_cvt_u32_f32_e32 v1, v1
29267 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29268 ; GFX7-NEXT: v_xor_b32_e32 v0, v2, v3
29269 ; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
29270 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
29271 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
29272 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29274 ; GFX8-LABEL: v_fptosi_bf16_to_i64:
29276 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29277 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29278 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0
29279 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29280 ; GFX8-NEXT: v_mul_f32_e64 v1, |v0|, s4
29281 ; GFX8-NEXT: v_floor_f32_e32 v1, v1
29282 ; GFX8-NEXT: s_mov_b32 s4, 0xcf800000
29283 ; GFX8-NEXT: v_fma_f32 v2, v1, s4, |v0|
29284 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
29285 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
29286 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29287 ; GFX8-NEXT: v_xor_b32_e32 v0, v2, v3
29288 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
29289 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
29290 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
29291 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29293 ; GFX9-LABEL: v_fptosi_bf16_to_i64:
29295 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29296 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29297 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0
29298 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29299 ; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4
29300 ; GFX9-NEXT: v_floor_f32_e32 v1, v1
29301 ; GFX9-NEXT: s_mov_b32 s4, 0xcf800000
29302 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
29303 ; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0|
29304 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
29305 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29306 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3
29307 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3
29308 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
29309 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
29310 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29312 ; GFX10-LABEL: v_fptosi_bf16_to_i64:
29314 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29315 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29316 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29317 ; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
29318 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29319 ; GFX10-NEXT: v_floor_f32_e32 v1, v1
29320 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
29321 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
29322 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v2
29323 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
29324 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v3
29325 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
29326 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29327 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29329 ; GFX11-LABEL: v_fptosi_bf16_to_i64:
29331 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29332 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29333 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29334 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29335 ; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
29336 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29338 ; GFX11-NEXT: v_floor_f32_e32 v1, v1
29339 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
29340 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
29341 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29342 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2
29343 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
29344 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29345 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3
29346 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
29347 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
29348 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29349 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29350 %op = fptosi bfloat %x to i64
; Test: fptosi from <2 x bfloat> to <2 x i64>.
; Each input goes through the same expanded sequence in every target's
; expectation: v_trunc_f32, multiply |x| by 0x2f800000 (the f32 bit pattern of
; 2^-32) and v_floor_f32 for the high word, v_fma_f32 with 0xcf800000 (the f32
; bit pattern of -2^32) for the low word, v_cvt_u32_f32 on both, then xor with
; and subtract the sign mask (x arithmetically shifted right by 31), carrying
; through vcc.
; The GCN and GFX7 checks read the two inputs from v0 and v1 (mul by 1.0 plus
; 0xffff0000 mask); the GFX8 through GFX11 checks read them from the low and
; high 16 bits of v0. In all cases the results end up in v[0:1] and v[2:3].
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing.
29354 define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
29355 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
29357 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29358 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29359 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29360 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29361 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29362 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29363 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29364 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29365 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29366 ; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4
29367 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29368 ; GCN-NEXT: v_mul_f32_e64 v4, |v1|, s4
29369 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
29370 ; GCN-NEXT: v_floor_f32_e32 v2, v2
29371 ; GCN-NEXT: v_floor_f32_e32 v4, v4
29372 ; GCN-NEXT: v_fma_f32 v0, v2, s5, |v0|
29373 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29374 ; GCN-NEXT: v_fma_f32 v1, v4, s5, |v1|
29375 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
29376 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29377 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v3
29378 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29379 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
29380 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v3
29381 ; GCN-NEXT: v_xor_b32_e32 v6, v1, v5
29382 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
29383 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc
29384 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v5
29385 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
29386 ; GCN-NEXT: s_setpc_b64 s[30:31]
29388 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64:
29390 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29391 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29392 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29393 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29394 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29395 ; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4
29396 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29397 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29398 ; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0|
29399 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29401 ; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0
29402 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29403 ; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4
29404 ; GFX7-NEXT: v_trunc_f32_e32 v3, v1
29405 ; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4
29406 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29407 ; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
29408 ; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3|
29409 ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
29410 ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1
29411 ; GFX7-NEXT: v_xor_b32_e32 v2, v2, v4
29412 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
29413 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
29414 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc
29415 ; GFX7-NEXT: v_xor_b32_e32 v2, v5, v3
29416 ; GFX7-NEXT: v_xor_b32_e32 v4, v6, v3
29417 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
29418 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29419 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29421 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64:
29423 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29424 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29425 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29426 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29427 ; GFX8-NEXT: v_mul_f32_e64 v2, |v1|, s4
29428 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29429 ; GFX8-NEXT: v_floor_f32_e32 v2, v2
29430 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29431 ; GFX8-NEXT: v_trunc_f32_e32 v4, v0
29432 ; GFX8-NEXT: v_fma_f32 v3, v2, s5, |v1|
29433 ; GFX8-NEXT: v_mul_f32_e64 v0, |v4|, s4
29434 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29435 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29436 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
29437 ; GFX8-NEXT: v_fma_f32 v5, v0, s5, |v4|
29438 ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
29439 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29440 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
29441 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v1
29442 ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v1
29443 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v3, v1
29444 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29445 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
29446 ; GFX8-NEXT: v_xor_b32_e32 v2, v5, v3
29447 ; GFX8-NEXT: v_xor_b32_e32 v4, v6, v3
29448 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29449 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29450 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29452 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
29454 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29455 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29456 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
29457 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29458 ; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4
29459 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29460 ; GFX9-NEXT: v_floor_f32_e32 v2, v2
29461 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
29462 ; GFX9-NEXT: v_trunc_f32_e32 v4, v0
29463 ; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1|
29464 ; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4
29465 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
29466 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
29467 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
29468 ; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4|
29469 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
29470 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29471 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
29472 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1
29473 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
29474 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
29475 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29476 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
29477 ; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3
29478 ; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3
29479 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
29480 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29481 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29483 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
29485 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29486 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29487 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29488 ; GFX10-NEXT: v_trunc_f32_e32 v1, v1
29489 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29490 ; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
29491 ; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
29492 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29493 ; GFX10-NEXT: v_floor_f32_e32 v2, v2
29494 ; GFX10-NEXT: v_floor_f32_e32 v3, v3
29495 ; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
29496 ; GFX10-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
29497 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29498 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
29499 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
29500 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v4
29501 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v5
29502 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
29503 ; GFX10-NEXT: v_xor_b32_e32 v3, v3, v6
29504 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
29505 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v6
29506 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
29507 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29508 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
29509 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29510 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29512 ; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
29514 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29515 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29516 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29517 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29518 ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
29519 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29520 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29521 ; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
29522 ; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
29523 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29524 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29525 ; GFX11-NEXT: v_floor_f32_e32 v2, v2
29526 ; GFX11-NEXT: v_floor_f32_e32 v3, v3
29527 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29528 ; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
29529 ; GFX11-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
29530 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29531 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
29532 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
29533 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v4
29534 ; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v5
29535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29536 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
29537 ; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6
29538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29539 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
29540 ; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6
29541 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29542 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
29543 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29544 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
29545 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29546 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29547 %op = fptosi <2 x bfloat> %x to <2 x i64>
29551 define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
29552 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
29554 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29555 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29556 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29557 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29558 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29559 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29560 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29561 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29562 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29563 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29564 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29565 ; GCN-NEXT: v_trunc_f32_e32 v2, v2
29566 ; GCN-NEXT: v_mul_f32_e64 v3, |v0|, s4
29567 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
29568 ; GCN-NEXT: v_mul_f32_e64 v5, |v1|, s4
29569 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
29570 ; GCN-NEXT: v_mul_f32_e64 v7, |v2|, s4
29571 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v2
29572 ; GCN-NEXT: v_floor_f32_e32 v3, v3
29573 ; GCN-NEXT: v_floor_f32_e32 v5, v5
29574 ; GCN-NEXT: v_floor_f32_e32 v7, v7
29575 ; GCN-NEXT: v_fma_f32 v0, v3, s5, |v0|
29576 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
29577 ; GCN-NEXT: v_fma_f32 v1, v5, s5, |v1|
29578 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
29579 ; GCN-NEXT: v_fma_f32 v2, v7, s5, |v2|
29580 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7
29581 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29582 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v4
29583 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29584 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v6
29585 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29586 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v8
29587 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
29588 ; GCN-NEXT: v_xor_b32_e32 v9, v1, v6
29589 ; GCN-NEXT: v_xor_b32_e32 v10, v2, v8
29590 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
29591 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
29592 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v9, v6
29593 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc
29594 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, v10, v8
29595 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, v7, v8, vcc
29596 ; GCN-NEXT: s_setpc_b64 s[30:31]
29598 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64:
29600 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29601 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29602 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29603 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29604 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29605 ; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
29606 ; GFX7-NEXT: v_floor_f32_e32 v3, v3
29607 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29608 ; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0|
29609 ; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4
29610 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29611 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0
29612 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29613 ; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5
29614 ; GFX7-NEXT: v_trunc_f32_e32 v4, v1
29615 ; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4
29616 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29617 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29618 ; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4|
29619 ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6
29620 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29621 ; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5
29622 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
29623 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1
29624 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc
29625 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29626 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29627 ; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3
29628 ; GFX7-NEXT: v_trunc_f32_e32 v6, v2
29629 ; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4
29630 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29631 ; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3
29632 ; GFX7-NEXT: v_fma_f32 v7, v2, s5, |v6|
29633 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
29634 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v2
29635 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v5, v3
29636 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v6
29637 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29638 ; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5
29639 ; GFX7-NEXT: v_xor_b32_e32 v6, v8, v5
29640 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
29641 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29642 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29644 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64:
29646 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29647 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29648 ; GFX8-NEXT: v_trunc_f32_e32 v2, v2
29649 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29650 ; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
29651 ; GFX8-NEXT: v_floor_f32_e32 v3, v3
29652 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29653 ; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
29654 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29655 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
29656 ; GFX8-NEXT: v_trunc_f32_e32 v5, v0
29657 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29658 ; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
29659 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29660 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29661 ; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
29662 ; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
29663 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v6
29664 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29665 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
29666 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v0
29667 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
29668 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29669 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc
29670 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29671 ; GFX8-NEXT: v_mul_f32_e64 v5, |v1|, s4
29672 ; GFX8-NEXT: v_floor_f32_e32 v5, v5
29673 ; GFX8-NEXT: v_xor_b32_e32 v2, v7, v3
29674 ; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v1|
29675 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29676 ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
29677 ; GFX8-NEXT: v_xor_b32_e32 v4, v8, v3
29678 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29679 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29680 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29681 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v1
29682 ; GFX8-NEXT: v_xor_b32_e32 v5, v5, v1
29683 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v1
29684 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
29685 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
29686 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29688 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
29690 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29691 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29692 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
29693 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29694 ; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
29695 ; GFX9-NEXT: v_floor_f32_e32 v3, v3
29696 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
29697 ; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
29698 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29699 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
29700 ; GFX9-NEXT: v_trunc_f32_e32 v5, v0
29701 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
29702 ; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
29703 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
29704 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29705 ; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
29706 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
29707 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
29708 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29709 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
29710 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0
29711 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
29712 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
29713 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
29714 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29715 ; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4
29716 ; GFX9-NEXT: v_floor_f32_e32 v5, v5
29717 ; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3
29718 ; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1|
29719 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
29720 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
29721 ; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3
29722 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
29723 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29724 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29725 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1
29726 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1
29727 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
29728 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
29729 ; GFX9-NEXT: v_mov_b32_e32 v1, v6
29730 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29732 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
29734 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29735 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29736 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29737 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29738 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2
29739 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29740 ; GFX10-NEXT: v_trunc_f32_e32 v1, v1
29741 ; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
29742 ; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
29743 ; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
29744 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
29745 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
29746 ; GFX10-NEXT: v_floor_f32_e32 v3, v3
29747 ; GFX10-NEXT: v_floor_f32_e32 v4, v4
29748 ; GFX10-NEXT: v_floor_f32_e32 v6, v6
29749 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v1
29750 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
29751 ; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
29752 ; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
29753 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
29754 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
29755 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
29756 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
29757 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
29758 ; GFX10-NEXT: v_xor_b32_e32 v3, v3, v5
29759 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
29760 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
29761 ; GFX10-NEXT: v_xor_b32_e32 v9, v0, v7
29762 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v7
29763 ; GFX10-NEXT: v_xor_b32_e32 v10, v1, v8
29764 ; GFX10-NEXT: v_xor_b32_e32 v6, v6, v8
29765 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
29766 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29767 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
29768 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29769 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
29770 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29771 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29773 ; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
29775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29776 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29777 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29778 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29779 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29780 ; GFX11-NEXT: v_trunc_f32_e32 v2, v2
29781 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29782 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29783 ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
29784 ; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
29785 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29786 ; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
29787 ; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
29788 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
29789 ; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
29790 ; GFX11-NEXT: v_floor_f32_e32 v3, v3
29791 ; GFX11-NEXT: v_floor_f32_e32 v4, v4
29792 ; GFX11-NEXT: v_floor_f32_e32 v6, v6
29793 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v1
29794 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29795 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
29796 ; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
29797 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
29798 ; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
29799 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
29800 ; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4
29801 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
29802 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
29803 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
29804 ; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
29805 ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
29806 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
29807 ; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7
29808 ; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7
29809 ; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8
29810 ; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8
29811 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
29812 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29813 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
29814 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29815 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
29816 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29817 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29818 %op = fptosi <3 x bfloat> %x to <3 x i64>
; Codegen test for fptosi <4 x bfloat> to <4 x i64>, with the expected llc
; output recorded per check prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11).
; What the recorded sequences do for every lane
;   - v_trunc_f32 on the bf16 value widened to f32, then the magnitude is split
;     with hi = floor(|x| * 0x2f800000) and lo = fma(hi, 0xcf800000, |x|)
;     (the f32 bit patterns of 2^-32 and -2^32), and both halves go through
;     v_cvt_u32_f32.
;   - the sign word produced by v_ashrrev_i32 31 is xor-ed into both halves and
;     then subtracted with borrow, which negates the 64-bit result when the
;     input was negative.
; GCN and GFX7 checks consume one element per register (v0-v3, v_mul_f32 by 1.0
; then mask 0xffff0000). GFX8 and newer read two elements each from v0 and v1
; (v_lshlrev_b32 16 for the low half, mask 0xffff0000 for the high half).
; These assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
29822 define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
29823 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
29825 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29826 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
29827 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29828 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29829 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29830 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29831 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29832 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29833 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29834 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29835 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29836 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29837 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29838 ; GCN-NEXT: v_trunc_f32_e32 v2, v2
29839 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
29840 ; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4
29841 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0
29842 ; GCN-NEXT: v_mul_f32_e64 v6, |v1|, s4
29843 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1
29844 ; GCN-NEXT: v_mul_f32_e64 v8, |v2|, s4
29845 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v2
29846 ; GCN-NEXT: v_mul_f32_e64 v10, |v3|, s4
29847 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3
29848 ; GCN-NEXT: v_floor_f32_e32 v4, v4
29849 ; GCN-NEXT: v_floor_f32_e32 v6, v6
29850 ; GCN-NEXT: v_floor_f32_e32 v8, v8
29851 ; GCN-NEXT: v_floor_f32_e32 v10, v10
29852 ; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0|
29853 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
29854 ; GCN-NEXT: v_fma_f32 v1, v6, s5, |v1|
29855 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
29856 ; GCN-NEXT: v_fma_f32 v2, v8, s5, |v2|
29857 ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
29858 ; GCN-NEXT: v_fma_f32 v3, v10, s5, |v3|
29859 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
29860 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29861 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
29862 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29863 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v7
29864 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29865 ; GCN-NEXT: v_xor_b32_e32 v8, v8, v9
29866 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
29867 ; GCN-NEXT: v_xor_b32_e32 v10, v10, v11
29868 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5
29869 ; GCN-NEXT: v_xor_b32_e32 v12, v1, v7
29870 ; GCN-NEXT: v_xor_b32_e32 v13, v2, v9
29871 ; GCN-NEXT: v_xor_b32_e32 v14, v3, v11
29872 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
29873 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc
29874 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v12, v7
29875 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v6, v7, vcc
29876 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, v13, v9
29877 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, v8, v9, vcc
29878 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, v14, v11
29879 ; GCN-NEXT: v_subb_u32_e32 v7, vcc, v10, v11, vcc
29880 ; GCN-NEXT: s_setpc_b64 s[30:31]
29882 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64:
29884 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29885 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29886 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29887 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29888 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29889 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3
29890 ; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
29891 ; GFX7-NEXT: v_floor_f32_e32 v3, v3
29892 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29893 ; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0|
29894 ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
29895 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29896 ; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29897 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29898 ; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6
29899 ; GFX7-NEXT: v_trunc_f32_e32 v5, v1
29900 ; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4
29901 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29902 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29903 ; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5|
29904 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
29905 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29906 ; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6
29907 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
29908 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1
29909 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
29910 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29911 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29912 ; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3
29913 ; GFX7-NEXT: v_trunc_f32_e32 v7, v2
29914 ; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4
29915 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29916 ; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3
29917 ; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7|
29918 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8
29919 ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2
29920 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3
29921 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
29922 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7
29923 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
29924 ; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5
29925 ; GFX7-NEXT: v_trunc_f32_e32 v8, v4
29926 ; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4
29927 ; GFX7-NEXT: v_floor_f32_e32 v4, v4
29928 ; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5
29929 ; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8|
29930 ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9
29931 ; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4
29932 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5
29933 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8
29934 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29935 ; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7
29936 ; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7
29937 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
29938 ; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
29939 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29941 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64:
29943 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29944 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29945 ; GFX8-NEXT: v_trunc_f32_e32 v2, v2
29946 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29947 ; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
29948 ; GFX8-NEXT: v_floor_f32_e32 v3, v3
29949 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29950 ; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
29951 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29952 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
29953 ; GFX8-NEXT: v_trunc_f32_e32 v5, v0
29954 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29955 ; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
29956 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29957 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29958 ; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
29959 ; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
29960 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
29961 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
29962 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v0
29963 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
29964 ; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc
29965 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29966 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
29967 ; GFX8-NEXT: v_trunc_f32_e32 v5, v5
29968 ; GFX8-NEXT: v_xor_b32_e32 v2, v6, v3
29969 ; GFX8-NEXT: v_mul_f32_e64 v6, |v5|, s4
29970 ; GFX8-NEXT: v_floor_f32_e32 v6, v6
29971 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v3
29972 ; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v5|
29973 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29974 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29975 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29976 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
29977 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29978 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29979 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v5
29980 ; GFX8-NEXT: v_mul_f32_e64 v7, |v1|, s4
29981 ; GFX8-NEXT: v_floor_f32_e32 v7, v7
29982 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
29983 ; GFX8-NEXT: v_fma_f32 v9, v7, s5, |v1|
29984 ; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v9
29985 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29986 ; GFX8-NEXT: v_xor_b32_e32 v6, v6, v5
29987 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
29988 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29989 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29990 ; GFX8-NEXT: v_xor_b32_e32 v6, v9, v1
29991 ; GFX8-NEXT: v_xor_b32_e32 v7, v7, v1
29992 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v1
29993 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc
29994 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
29995 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29997 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
29999 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30000 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
30001 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
30002 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
30003 ; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
30004 ; GFX9-NEXT: v_floor_f32_e32 v3, v3
30005 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
30006 ; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
30007 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30008 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
30009 ; GFX9-NEXT: v_trunc_f32_e32 v5, v0
30010 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
30011 ; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
30012 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
30013 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
30014 ; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
30015 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
30016 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
30017 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
30018 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
30019 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
30020 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
30021 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
30022 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
30023 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
30024 ; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3
30025 ; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4
30026 ; GFX9-NEXT: v_floor_f32_e32 v6, v6
30027 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3
30028 ; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5|
30029 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
30030 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30031 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
30032 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
30033 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
30034 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
30035 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5
30036 ; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4
30037 ; GFX9-NEXT: v_floor_f32_e32 v7, v7
30038 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
30039 ; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1|
30040 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
30041 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
30042 ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5
30043 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
30044 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
30045 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
30046 ; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1
30047 ; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1
30048 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
30049 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
30050 ; GFX9-NEXT: v_mov_b32_e32 v1, v8
30051 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30053 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
30055 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30056 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
30057 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30058 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
30059 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30060 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2
30061 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
30062 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
30063 ; GFX10-NEXT: v_trunc_f32_e32 v4, v1
30064 ; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
30065 ; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
30066 ; GFX10-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
30067 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
30068 ; GFX10-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
30069 ; GFX10-NEXT: v_floor_f32_e32 v1, v1
30070 ; GFX10-NEXT: v_floor_f32_e32 v6, v6
30071 ; GFX10-NEXT: v_floor_f32_e32 v8, v8
30072 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
30073 ; GFX10-NEXT: v_floor_f32_e32 v9, v9
30074 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
30075 ; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
30076 ; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v3
30077 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
30078 ; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
30079 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
30080 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
30081 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
30082 ; GFX10-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
30083 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
30084 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
30085 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v3
30086 ; GFX10-NEXT: v_xor_b32_e32 v3, v0, v7
30087 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
30088 ; GFX10-NEXT: v_xor_b32_e32 v6, v6, v7
30089 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
30090 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
30091 ; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v4
30092 ; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v9
30093 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30094 ; GFX10-NEXT: v_xor_b32_e32 v4, v12, v10
30095 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
30096 ; GFX10-NEXT: v_xor_b32_e32 v5, v8, v10
30097 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30098 ; GFX10-NEXT: v_xor_b32_e32 v6, v11, v13
30099 ; GFX10-NEXT: v_xor_b32_e32 v7, v9, v13
30100 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
30101 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30102 ; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
30103 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30104 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30106 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
30108 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30109 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
30110 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30111 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
30112 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30113 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30114 ; GFX11-NEXT: v_trunc_f32_e32 v2, v2
30115 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
30116 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30117 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
30118 ; GFX11-NEXT: v_trunc_f32_e32 v4, v1
30119 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30120 ; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
30121 ; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
30122 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
30123 ; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
30124 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
30125 ; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
30126 ; GFX11-NEXT: v_floor_f32_e32 v1, v1
30127 ; GFX11-NEXT: v_floor_f32_e32 v6, v6
30128 ; GFX11-NEXT: v_floor_f32_e32 v8, v8
30129 ; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
30130 ; GFX11-NEXT: v_floor_f32_e32 v9, v9
30131 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
30132 ; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
30133 ; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v3
30134 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
30135 ; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
30136 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
30137 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
30138 ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
30139 ; GFX11-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
30140 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
30141 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
30142 ; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v3
30143 ; GFX11-NEXT: v_xor_b32_e32 v3, v0, v7
30144 ; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8
30145 ; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7
30146 ; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11
30147 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
30148 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4
30149 ; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9
30150 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30151 ; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
30152 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
30153 ; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10
30154 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30155 ; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13
30156 ; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13
30157 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
30158 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30159 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30160 ; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
30161 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30162 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30163 %op = fptosi <4 x bfloat> %x to <4 x i64>
; Codegen test for sitofp i16 to bfloat, with the expected llc output recorded
; per check prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11).
; GCN and GFX7 checks sign-extend with v_bfe_i32, convert with v_cvt_f32_i32
; and keep only the upper 16 bits of the f32 via the 0xffff0000 mask.
; GFX8 and newer checks instead add bit 16 of the f32 result plus 0x7fff before
; shifting right by 16, and select the value or-ed with 0x400000 when
; v_cmp_u_f32 reports an unordered compare. NOTE(review) this looks like
; round-to-nearest-even with NaN quieting - confirm against the lowering code.
; GFX8, GFX9 and GFX10 checks fold the sign extension into an SDWA convert
; (sext, WORD_0), while the GFX11 checks use a separate v_bfe_i32.
; These assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
30167 define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
30168 ; GCN-LABEL: v_sitofp_i16_to_bf16:
30170 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30171 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30172 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30173 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30174 ; GCN-NEXT: s_setpc_b64 s[30:31]
30176 ; GFX7-LABEL: v_sitofp_i16_to_bf16:
30178 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30179 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30180 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30181 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30182 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30184 ; GFX8-LABEL: v_sitofp_i16_to_bf16:
30186 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30187 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30188 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30189 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30190 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30191 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
30192 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30193 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
30194 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30195 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30197 ; GFX9-LABEL: v_sitofp_i16_to_bf16:
30199 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30200 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30201 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30202 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
30203 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
30204 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
30205 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30206 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
30207 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30208 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30210 ; GFX10-LABEL: v_sitofp_i16_to_bf16:
30212 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30213 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30214 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
30215 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
30216 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30217 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30218 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30219 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30220 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30222 ; GFX11-LABEL: v_sitofp_i16_to_bf16:
30224 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30225 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
30226 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30227 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30228 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
30229 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
30230 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30231 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
30232 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30233 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30235 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30236 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30237 %op = sitofp i16 %x to bfloat
; Codegen test for sitofp <2 x i16> to <2 x bfloat>, with the expected llc
; output recorded per check prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11).
; GCN and GFX7 checks take the two elements in v0 and v1 and leave the results
; in the same registers, each converted by v_cvt_f32_i32 and masked with
; 0xffff0000.
; GFX8 and newer checks read both i16 lanes from v0 (WORD_0 and WORD_1 SDWA
; selects, or v_bfe_i32 plus v_ashrrev_i32 16 on GFX11), apply per lane the
; bit-16 plus 0x7fff adjustment with the 0x400000 select on an unordered
; compare, and pack the two upper halves into v0 with v_alignbit_b32 (GFX8) or
; v_perm_b32 using selector 0x7060302 (GFX9 and newer).
; These assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script instead of editing them by hand.
30241 define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
30242 ; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
30244 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30245 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30246 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30247 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30248 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30249 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30250 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30251 ; GCN-NEXT: s_setpc_b64 s[30:31]
30253 ; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
30255 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30256 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30257 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30258 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30259 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30260 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30261 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30262 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30264 ; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
30266 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30267 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30268 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30269 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30270 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30271 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30272 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30273 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30274 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30275 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30276 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30277 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30278 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
30279 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30280 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30281 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30282 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
30283 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30285 ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
30287 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30288 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30289 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30290 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30291 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
30292 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
30293 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
30294 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30295 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
30296 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
30297 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
30298 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
30299 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30300 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30301 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
30303 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30305 ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
30307 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30308 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30309 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30310 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
30311 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
30312 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
30313 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30314 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
30315 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
30316 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30317 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30318 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30319 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30320 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
30321 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30323 ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
30325 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30326 ; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16
30327 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30328 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30329 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30330 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30332 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
30333 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
30334 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
30335 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30336 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
30337 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
30338 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30339 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30340 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30341 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30342 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30344 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
30345 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30346 %op = sitofp <2 x i16> %x to <2 x bfloat>
30347 ret <2 x bfloat> %op
; Test: sitofp <3 x i16> -> <3 x bfloat>, with one group of expected llc output
; per RUN-line check prefix.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
; Expected shape, as recorded below:
;  * GCN and GFX7 groups: each lane is sign-extended with v_bfe_i32, converted
;    with v_cvt_f32_i32 and masked with 0xffff0000.
;  * GFX8 and later groups: each lane goes through a v_bfe_u32 / add 0x7fff /
;    v_cmp_u_f32 / v_cndmask sequence, and the results are packed with
;    v_alignbit_b32 and/or v_perm_b32.
;  * The two gfx1100 groups are kept separate here: they differ in the first
;    source operand of the final v_alignbit_b32 (v0 vs s0) and in the
;    s_delay_alu that precedes it.
30350 define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
30351 ; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
30353 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30354 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30355 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30356 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
30357 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30358 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30359 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30360 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30361 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30362 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30363 ; GCN-NEXT: s_setpc_b64 s[30:31]
30365 ; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
30367 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30368 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30369 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30370 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
30371 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30372 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30373 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30374 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30375 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30376 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30377 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30379 ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
30381 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30382 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30383 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30384 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30385 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30386 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30387 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30388 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30389 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30390 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30391 ; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
30392 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
30393 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30394 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
30395 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
30396 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
30397 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
30398 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
30399 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30400 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
30401 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30402 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30403 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30404 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30405 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
30406 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30408 ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
30410 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30411 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30412 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30413 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30414 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30415 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30416 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30417 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
30418 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30419 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30420 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30421 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30422 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30423 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30424 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30425 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
30426 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
30427 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
30428 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30429 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30430 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30431 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
30432 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
30433 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30435 ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
30437 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30438 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30439 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30440 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30441 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
30442 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
30443 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
30444 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30445 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
30446 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30447 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
30448 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30449 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
30450 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30451 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30452 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30453 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30454 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30455 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30456 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30457 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
30458 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30460 ; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30461 ; GFX11TRUE16: ; %bb.0:
30462 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30463 ; GFX11TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 16
30464 ; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
30465 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30466 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30467 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30468 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30469 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30470 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30471 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
30472 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30473 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
30474 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
30475 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
30476 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30477 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30478 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
30479 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
30480 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30481 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30482 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30483 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30484 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30485 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30486 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30487 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30488 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30489 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
30490 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
30491 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
30493 ; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30494 ; GFX11FAKE16: ; %bb.0:
30495 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30496 ; GFX11FAKE16-NEXT: v_bfe_i32 v2, v0, 0, 16
30497 ; GFX11FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 16
30498 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30499 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30500 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30501 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30502 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30503 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30504 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
30505 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30506 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
30507 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
30508 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
30509 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30510 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30511 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
30512 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
30513 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30514 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30515 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30516 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30517 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30518 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30519 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30520 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30521 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30522 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
30523 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
30524 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
30525 %op = sitofp <3 x i16> %x to <3 x bfloat>
30526 ret <3 x bfloat> %op
; Test: sitofp <4 x i16> -> <4 x bfloat>, with one group of expected llc output
; per RUN-line check prefix.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
; Expected shape, as recorded below:
;  * GCN and GFX7 groups: v_bfe_i32 sign-extension, v_cvt_f32_i32 and a
;    0xffff0000 mask per lane, leaving one lane per result register v0..v3.
;  * GFX8 group: a per-lane v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence, then two v_alignbit_b32 to pack pairs into v0 and v1.
;  * GFX9 and later groups: the same per-lane sequence using v_add3_u32, packed
;    with two v_perm_b32 using selector 0x7060302.
;  * Both gfx1100 RUN lines share the common GFX11 group for this function.
30529 define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
30530 ; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
30532 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30533 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30534 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30535 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
30536 ; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16
30537 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
30538 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30539 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30540 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30541 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30542 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30543 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30544 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
30545 ; GCN-NEXT: s_setpc_b64 s[30:31]
30547 ; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
30549 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30550 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30551 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30552 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
30553 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
30554 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30555 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30556 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30557 ; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
30558 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30559 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30560 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30561 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
30562 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30564 ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
30566 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30567 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30568 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30569 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30570 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
30571 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
30572 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
30573 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30574 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
30575 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30576 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
30577 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30578 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30579 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
30580 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
30581 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30582 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30583 ; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1
30584 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30585 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
30586 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
30587 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5
30588 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
30589 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30590 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
30591 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
30592 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30593 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
30594 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30595 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
30596 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30597 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30598 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
30599 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
30600 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30602 ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
30604 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30605 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30606 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30607 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30608 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30609 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30610 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30611 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30612 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30613 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30614 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30615 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30616 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30617 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
30618 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30619 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
30620 ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
30621 ; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
30622 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
30623 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
30624 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
30625 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
30626 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
30627 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
30628 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30629 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
30630 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30631 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
30632 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
30633 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30635 ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
30637 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30638 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30639 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30640 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30641 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30642 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
30643 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
30644 ; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
30645 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30646 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
30647 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30648 ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
30649 ; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
30650 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
30651 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
30652 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30653 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
30654 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
30655 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
30656 ; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
30657 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30658 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30659 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30660 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30661 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
30662 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30663 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
30664 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30666 ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
30668 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30669 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
30670 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
30671 ; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16
30672 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30673 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30674 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
30675 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30676 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30677 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
30678 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30679 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
30680 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
30681 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
30682 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
30683 ; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
30684 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30685 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30686 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
30687 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
30688 ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
30689 ; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
30690 ; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
30691 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30692 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
30693 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
30694 ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
30695 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30696 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30697 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
30698 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30699 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30700 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
30701 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30702 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30703 ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
30704 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30705 %op = sitofp <4 x i16> %x to <4 x bfloat>
30706 ret <4 x bfloat> %op
; Test: scalar sitofp i32 -> bfloat, with one group of expected llc output per
; RUN-line check prefix.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
; Expected shape, as recorded below:
;  * GCN and GFX7 groups: v_cvt_f32_i32 followed by a 0xffff0000 mask.
;  * GFX8 and later groups: v_cvt_f32_i32, then a v_bfe_u32 / add 0x7fff /
;    v_cmp_u_f32 / v_cndmask sequence and a final 16-bit right shift.
;  * Both gfx1100 RUN lines share the common GFX11 group for this function.
; The body must end in a terminator: %op is returned, matching the sibling
; sitofp tests in this file.
30709 define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
30710 ; GCN-LABEL: v_sitofp_i32_to_bf16:
30712 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30713 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30714 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30715 ; GCN-NEXT: s_setpc_b64 s[30:31]
30717 ; GFX7-LABEL: v_sitofp_i32_to_bf16:
30719 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30720 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30721 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30722 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30724 ; GFX8-LABEL: v_sitofp_i32_to_bf16:
30726 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30727 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30728 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30729 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30730 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30731 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
30732 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30733 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
30734 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30735 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30737 ; GFX9-LABEL: v_sitofp_i32_to_bf16:
30739 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30740 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30741 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30742 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
30743 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
30744 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
30745 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30746 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
30747 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30748 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30750 ; GFX10-LABEL: v_sitofp_i32_to_bf16:
30752 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30753 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30754 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
30755 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
30756 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30757 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30758 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30759 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30760 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30762 ; GFX11-LABEL: v_sitofp_i32_to_bf16:
30764 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30765 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30766 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30767 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
30768 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
30769 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30770 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30771 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30772 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30773 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30774 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30775 %op = sitofp i32 %x to bfloat
30776 ret bfloat %op
; Test: sitofp <2 x i32> -> <2 x bfloat>, with one group of expected llc output
; per RUN-line check prefix.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
; Expected shape, as recorded below:
;  * GCN and GFX7 groups: v_cvt_f32_i32 and a 0xffff0000 mask per lane.
;  * GFX8 group: a per-lane v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence, packed into v0 with v_lshrrev_b32 + v_alignbit_b32.
;  * GFX9 and later groups: the same per-lane sequence using v_add3_u32, packed
;    with one v_perm_b32 using selector 0x7060302.
;  * Both gfx1100 RUN lines share the common GFX11 group for this function.
30779 define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
30780 ; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
30782 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30783 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30784 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30785 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30786 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30787 ; GCN-NEXT: s_setpc_b64 s[30:31]
30789 ; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
30791 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30792 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30793 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30794 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30795 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30796 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30798 ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
30800 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30801 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30802 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
30803 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
30804 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
30805 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30806 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
30807 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30808 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
30809 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30810 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30811 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30812 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30813 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30814 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30815 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30816 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
30817 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30819 ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
30821 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30822 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30823 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
30824 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30825 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
30826 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
30827 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
30828 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30829 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30830 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
30831 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
30832 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
30833 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30834 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
30835 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30836 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
30837 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30839 ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
30841 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30842 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30843 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
30844 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
30845 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
30846 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
30847 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30848 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
30849 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
30850 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
30851 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30852 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30853 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30854 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30855 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30857 ; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
30859 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30860 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30861 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30862 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30863 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
30864 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
30865 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
30866 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30867 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
30868 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
30869 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
30870 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30871 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30872 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30873 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30874 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30875 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30876 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30877 %op = sitofp <2 x i32> %x to <2 x bfloat>
30878 ret <2 x bfloat> %op
; Test: sitofp <3 x i32> -> <3 x bfloat>, with one group of expected llc output
; per RUN-line check prefix.
; The check lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
; Expected shape, as recorded below:
;  * GCN and GFX7 groups: v_cvt_f32_i32 and a 0xffff0000 mask per lane.
;  * GFX8 group: a per-lane v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence; lanes 0 and 1 are packed with v_alignbit_b32 and the shifted
;    third lane is moved into v1.
;  * GFX9 and later groups: the same per-lane sequence using v_add3_u32; lanes
;    0 and 1 are packed with v_perm_b32 and the third lane is placed in v1 with
;    v_alignbit_b32.
;  * The two gfx1100 groups are kept separate here: in the lines shown they
;    differ only in the first source operand of the final v_alignbit_b32
;    (v0 vs s0).
30881 define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
30882 ; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
30884 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30885 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30886 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30887 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30888 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30889 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30890 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30891 ; GCN-NEXT: s_setpc_b64 s[30:31]
30893 ; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
30895 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30896 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30897 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30898 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30899 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30900 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30901 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30902 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30904 ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
30906 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30907 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
30908 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30909 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
30910 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
30911 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
30912 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30913 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
30914 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30915 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
30916 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
30917 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
30918 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30919 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
30920 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30921 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
30922 ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
30923 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
30924 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30925 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
30926 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30927 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
30928 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
30929 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30930 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
30931 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
30932 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30934 ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
30936 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30937 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
30938 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30939 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30940 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
30941 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30942 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30943 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30944 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30945 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30946 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
30947 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
30948 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
30949 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30950 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30951 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30952 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30953 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
30954 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30955 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30956 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30957 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
30958 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
30959 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30961 ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
30963 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30964 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30965 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
30966 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
30967 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
30968 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
30969 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
30970 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30971 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
30972 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30973 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
30974 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
30975 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
30976 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30977 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
30978 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30979 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
30980 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30981 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30982 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
30983 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
30984 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30986 ; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
30987 ; GFX11TRUE16: ; %bb.0:
30988 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30989 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30990 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30991 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30992 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30993 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
30994 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
30995 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
30996 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30997 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
30998 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30999 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
31000 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
31001 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
31002 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31003 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31004 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31005 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31006 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31007 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31008 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31009 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31010 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
31011 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
31013 ; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
31014 ; GFX11FAKE16: ; %bb.0:
31015 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31016 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31017 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31018 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31019 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31020 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31021 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
31022 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31023 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31024 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
31025 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31026 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
31027 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
31028 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
31029 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31030 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31031 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31032 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31033 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31034 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31035 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31036 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31037 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
31038 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
31039 %op = sitofp <3 x i32> %x to <3 x bfloat>
31040 ret <3 x bfloat> %op
; Test case: sitofp of a <4 x i32> argument to <4 x bfloat>, returned directly.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the expected code shows:
;  - GCN and GFX7 convert every lane with v_cvt_f32_i32 and then keep only the
;    upper 16 bits of each result (v_and_b32 with 0xffff0000).
;  - GFX8 through GFX11 convert every lane and then run a v_bfe_u32 (bit 16) /
;    add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 sequence per lane (presumably
;    round-to-nearest-even with NaN handling; confirm against the lowering).
;  - Lane pairs are packed with v_lshrrev_b32 + v_alignbit_b32 on GFX8 and
;    with v_perm_b32 (selector 0x7060302) on GFX9 and later.
31043 define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
31044 ; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
31046 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31047 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
31048 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
31049 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
31050 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31051 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31052 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31053 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
31054 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
31055 ; GCN-NEXT: s_setpc_b64 s[30:31]
31057 ; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
31059 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31060 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31061 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
31062 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31063 ; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
31064 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31065 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31066 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
31067 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
31068 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31070 ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
31072 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31073 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
31074 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
31075 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31076 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31077 ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
31078 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
31079 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31080 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
31081 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31082 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
31083 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
31084 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
31085 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
31086 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
31087 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
31088 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
31089 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
31090 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
31091 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
31092 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31093 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
31094 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31095 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
31096 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
31097 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
31098 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31099 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
31100 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
31101 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
31102 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
31103 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
31104 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
31105 ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
31106 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31108 ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
31110 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31111 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
31112 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
31113 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31114 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31115 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
31116 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
31117 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
31118 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31119 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
31120 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
31121 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
31122 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
31123 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
31124 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
31125 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
31126 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
31127 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
31128 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
31129 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31130 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
31131 ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
31132 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
31133 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
31134 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
31135 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
31136 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31137 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
31138 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
31139 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31141 ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
31143 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31144 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
31145 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31146 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31147 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
31148 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
31149 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
31150 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
31151 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31152 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
31153 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31154 ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
31155 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
31156 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
31157 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
31158 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31159 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31160 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
31161 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
31162 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
31163 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31164 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31165 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31166 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
31167 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31168 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31169 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
31170 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31172 ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
31174 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31175 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
31176 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31177 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
31178 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
31179 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
31180 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
31181 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
31182 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
31183 ; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
31184 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31185 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31186 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
31187 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
31188 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
31189 ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
31190 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31191 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31192 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
31193 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
31194 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31195 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31196 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
31197 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
31198 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31199 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
31200 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31201 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
31202 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31203 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
31204 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31205 %op = sitofp <4 x i32> %x to <4 x bfloat>
31206 ret <4 x bfloat> %op
; Test case: sitofp of a scalar i64 argument to bfloat.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the expected code shows, for every prefix:
;  - a shift amount is computed as the unsigned minimum of
;    (v_ffbh_i32(hi) - 1) and (32 + ((lo xor hi) >> 31, arithmetic));
;    GFX11 spells the first instruction v_cls_i32 instead of v_ffbh_i32;
;  - the 64-bit input is shifted left by that amount, min(lo, 1) is ORed into
;    the high dword, and that dword is converted with v_cvt_f32_i32;
;  - the result is rescaled with v_ldexp_f32 by (32 - shift amount).
; GCN and GFX7 then keep only the upper 16 bits (v_and_b32 with 0xffff0000).
; GFX8 through GFX11 instead run a v_bfe_u32 (bit 16) / add 0x7fff /
; v_cmp_u_f32 / v_cndmask_b32 sequence (presumably round-to-nearest-even with
; NaN handling; confirm against the lowering) followed by a 16-bit right shift.
31209 define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
31210 ; GCN-LABEL: v_sitofp_i64_to_bf16:
31212 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31213 ; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
31214 ; GCN-NEXT: v_ffbh_i32_e32 v3, v1
31215 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31216 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31217 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v2
31218 ; GCN-NEXT: v_min_u32_e32 v2, v3, v2
31219 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
31220 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31221 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31222 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31223 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
31224 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
31225 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31226 ; GCN-NEXT: s_setpc_b64 s[30:31]
31228 ; GFX7-LABEL: v_sitofp_i64_to_bf16:
31230 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31231 ; GFX7-NEXT: v_xor_b32_e32 v2, v0, v1
31232 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31233 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31234 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v2
31235 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31236 ; GFX7-NEXT: v_min_u32_e32 v2, v3, v2
31237 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
31238 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31239 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31240 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31241 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
31242 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
31243 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31244 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31246 ; GFX8-LABEL: v_sitofp_i64_to_bf16:
31248 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31249 ; GFX8-NEXT: v_xor_b32_e32 v2, v0, v1
31250 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31251 ; GFX8-NEXT: v_ffbh_i32_e32 v3, v1
31252 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2
31253 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3
31254 ; GFX8-NEXT: v_min_u32_e32 v2, v3, v2
31255 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31256 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31257 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31258 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31259 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
31260 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
31261 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
31262 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
31263 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
31264 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
31265 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31266 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
31267 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31268 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31270 ; GFX9-LABEL: v_sitofp_i64_to_bf16:
31272 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31273 ; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1
31274 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31275 ; GFX9-NEXT: v_ffbh_i32_e32 v3, v1
31276 ; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
31277 ; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
31278 ; GFX9-NEXT: v_min_u32_e32 v2, v3, v2
31279 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31280 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31281 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31282 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31283 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31284 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
31285 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
31286 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
31287 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
31288 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
31289 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31290 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
31291 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31292 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31294 ; GFX10-LABEL: v_sitofp_i64_to_bf16:
31296 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31297 ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
31298 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
31299 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31300 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
31301 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
31302 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
31303 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31304 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31305 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31306 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
31307 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31308 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
31309 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
31310 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
31311 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31312 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
31313 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31314 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31315 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31317 ; GFX11-LABEL: v_sitofp_i64_to_bf16:
31319 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31320 ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
31321 ; GFX11-NEXT: v_cls_i32_e32 v3, v1
31322 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31323 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31324 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
31325 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
31326 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
31327 ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
31328 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31329 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31330 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
31331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31332 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
31333 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
31334 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31335 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31336 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
31337 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
31338 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
31339 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31340 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
31341 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
31342 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31344 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31345 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31346 %op = sitofp i64 %x to bfloat
; Test case: sitofp of a <2 x i64> argument to <2 x bfloat>, returned directly.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the expected code shows, per 64-bit lane (v[0:1] and v[2:3]):
;  - a shift amount is computed as the unsigned minimum of
;    (v_ffbh_i32(hi) - 1) and (32 + ((lo xor hi) >> 31, arithmetic));
;    GFX11 spells the first instruction v_cls_i32 instead of v_ffbh_i32;
;  - the lane is shifted left by that amount, min(lo, 1) is ORed into the high
;    dword, that dword is converted with v_cvt_f32_i32, and the result is
;    rescaled with v_ldexp_f32 by (32 - shift amount).
; GCN and GFX7 then keep only the upper 16 bits of each lane (v_and_b32 with
; 0xffff0000) and return the lanes in v0 and v1.
; GFX8 through GFX11 instead run a v_bfe_u32 (bit 16) / add 0x7fff /
; v_cmp_u_f32 / v_cndmask_b32 sequence per lane (presumably round-to-nearest-
; even with NaN handling; confirm against the lowering) and pack both lanes
; into v0 with v_lshrrev_b32 + v_alignbit_b32 on GFX8 or v_perm_b32 (selector
; 0x7060302) on GFX9 and later.
31350 define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
31351 ; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
31353 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31354 ; GCN-NEXT: v_ffbh_i32_e32 v4, v3
31355 ; GCN-NEXT: v_xor_b32_e32 v5, v2, v3
31356 ; GCN-NEXT: v_ffbh_i32_e32 v6, v1
31357 ; GCN-NEXT: v_xor_b32_e32 v7, v0, v1
31358 ; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v4
31359 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31360 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31361 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31362 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31363 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31364 ; GCN-NEXT: v_min_u32_e32 v4, v4, v5
31365 ; GCN-NEXT: v_min_u32_e32 v5, v6, v7
31366 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
31367 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
31368 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
31369 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
31370 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
31371 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31372 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
31373 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31374 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2
31375 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31376 ; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
31377 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
31378 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31379 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31380 ; GCN-NEXT: s_setpc_b64 s[30:31]
31382 ; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
31384 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31385 ; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3
31386 ; GFX7-NEXT: v_ffbh_i32_e32 v4, v3
31387 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31388 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4
31389 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31390 ; GFX7-NEXT: v_min_u32_e32 v4, v4, v5
31391 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
31392 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
31393 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
31394 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
31395 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31396 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31397 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31398 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31399 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
31400 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
31401 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31402 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31403 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31404 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31405 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
31406 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
31407 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
31408 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
31409 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31410 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31411 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31413 ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
31415 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31416 ; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1
31417 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v1
31418 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31419 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
31420 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
31421 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
31422 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31423 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31424 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31425 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31426 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31427 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
31428 ; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
31429 ; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
31430 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
31431 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
31432 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0
31433 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
31434 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31435 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
31436 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
31437 ; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
31438 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
31439 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
31440 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31441 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31442 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31443 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31444 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
31445 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
31446 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
31447 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
31448 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
31449 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
31450 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
31451 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31452 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31453 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31454 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
31455 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31457 ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
31459 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31460 ; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1
31461 ; GFX9-NEXT: v_ffbh_i32_e32 v4, v1
31462 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31463 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
31464 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
31465 ; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
31466 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31467 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31468 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31469 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31470 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31471 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
31472 ; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
31473 ; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
31474 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
31475 ; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
31476 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
31477 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31478 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
31479 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
31480 ; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
31481 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
31482 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
31483 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31484 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31485 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31486 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31487 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
31488 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
31489 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
31490 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
31491 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
31492 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
31493 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31494 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31495 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31496 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
31497 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31499 ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
31501 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31502 ; GFX10-NEXT: v_xor_b32_e32 v4, v0, v1
31503 ; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3
31504 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v1
31505 ; GFX10-NEXT: v_ffbh_i32_e32 v7, v3
31506 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
31507 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31508 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
31509 ; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7
31510 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 32, v4
31511 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 32, v5
31512 ; GFX10-NEXT: v_min_u32_e32 v4, v6, v4
31513 ; GFX10-NEXT: v_min_u32_e32 v5, v7, v5
31514 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31515 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31516 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31517 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
31518 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31519 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
31520 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
31521 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
31522 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31523 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31524 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
31525 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
31526 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
31527 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
31528 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
31529 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31530 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
31531 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
31532 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
31533 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31534 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31535 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31536 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31537 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31539 ; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
31541 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31542 ; GFX11-NEXT: v_xor_b32_e32 v4, v0, v1
31543 ; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3
31544 ; GFX11-NEXT: v_cls_i32_e32 v6, v1
31545 ; GFX11-NEXT: v_cls_i32_e32 v7, v3
31546 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31547 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4
31548 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31549 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31550 ; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6
31551 ; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7
31552 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31553 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v4
31554 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 32, v5
31555 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31556 ; GFX11-NEXT: v_min_u32_e32 v4, v6, v4
31557 ; GFX11-NEXT: v_min_u32_e32 v5, v7, v5
31558 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31559 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31560 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31561 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31562 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
31563 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
31564 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31565 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
31566 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
31567 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4
31568 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5
31569 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31570 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31571 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
31572 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31573 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
31574 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
31575 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31576 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
31577 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
31578 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
31579 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31580 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
31581 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
31582 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
31583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
31584 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31585 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31586 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31588 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31589 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31590 %op = sitofp <2 x i64> %x to <2 x bfloat>
31591 ret <2 x bfloat> %op
31594 define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
31595 ; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
31597 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31598 ; GCN-NEXT: v_ffbh_i32_e32 v6, v5
31599 ; GCN-NEXT: v_xor_b32_e32 v7, v4, v5
31600 ; GCN-NEXT: v_ffbh_i32_e32 v8, v3
31601 ; GCN-NEXT: v_xor_b32_e32 v9, v2, v3
31602 ; GCN-NEXT: v_ffbh_i32_e32 v10, v1
31603 ; GCN-NEXT: v_xor_b32_e32 v11, v0, v1
31604 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31605 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31606 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
31607 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31608 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
31609 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
31610 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31611 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
31612 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
31613 ; GCN-NEXT: v_min_u32_e32 v6, v6, v7
31614 ; GCN-NEXT: v_min_u32_e32 v7, v8, v9
31615 ; GCN-NEXT: v_min_u32_e32 v8, v10, v11
31616 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
31617 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
31618 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
31619 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
31620 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
31621 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
31622 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
31623 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
31624 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31625 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
31626 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
31627 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31628 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4
31629 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
31630 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31631 ; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
31632 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
31633 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
31634 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31635 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31636 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
31637 ; GCN-NEXT: s_setpc_b64 s[30:31]
31639 ; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
31641 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31642 ; GFX7-NEXT: v_xor_b32_e32 v7, v4, v5
31643 ; GFX7-NEXT: v_ffbh_i32_e32 v6, v5
31644 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31645 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31646 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31647 ; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
31648 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
31649 ; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3
31650 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
31651 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
31652 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
31653 ; GFX7-NEXT: v_ffbh_i32_e32 v6, v3
31654 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31655 ; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
31656 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31657 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31658 ; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
31659 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
31660 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
31661 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
31662 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
31663 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
31664 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31665 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31666 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31667 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31668 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
31669 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
31670 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31671 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31672 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31673 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31674 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
31675 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
31676 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
31677 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
31678 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31679 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31680 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
31681 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31683 ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
31685 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31686 ; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5
31687 ; GFX8-NEXT: v_ffbh_i32_e32 v6, v5
31688 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31689 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
31690 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
31691 ; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
31692 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31693 ; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1
31694 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
31695 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
31696 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
31697 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
31698 ; GFX8-NEXT: v_ffbh_i32_e32 v7, v1
31699 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
31700 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31701 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
31702 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7
31703 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8
31704 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
31705 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31706 ; GFX8-NEXT: v_min_u32_e32 v7, v7, v8
31707 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
31708 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
31709 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
31710 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31711 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
31712 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31713 ; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3
31714 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31715 ; GFX8-NEXT: v_ffbh_i32_e32 v5, v3
31716 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
31717 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31718 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
31719 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
31720 ; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
31721 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31722 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
31723 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
31724 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
31725 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
31726 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
31727 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
31728 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
31729 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
31730 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
31731 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
31732 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31733 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
31734 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
31735 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
31736 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
31737 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
31738 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
31739 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
31740 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31741 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
31742 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
31743 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
31744 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31746 ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
31748 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31749 ; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5
31750 ; GFX9-NEXT: v_ffbh_i32_e32 v6, v5
31751 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31752 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
31753 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
31754 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
31755 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31756 ; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
31757 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
31758 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
31759 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
31760 ; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
31761 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31762 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
31763 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
31764 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
31765 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
31766 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
31767 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31768 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31769 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31770 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
31771 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
31772 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31773 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
31774 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
31775 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31776 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
31777 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
31778 ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
31779 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
31780 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
31781 ; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
31782 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
31783 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31784 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
31785 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
31786 ; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
31787 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
31788 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
31789 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31790 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31791 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31792 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
31793 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
31794 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
31795 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
31796 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
31797 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
31798 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
31799 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31800 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31801 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31802 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
31803 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
31804 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31806 ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
31808 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31809 ; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1
31810 ; GFX10-NEXT: v_xor_b32_e32 v7, v4, v5
31811 ; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3
31812 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v1
31813 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v5
31814 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31815 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31816 ; GFX10-NEXT: v_ffbh_i32_e32 v11, v3
31817 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31818 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
31819 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
31820 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
31821 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7
31822 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11
31823 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
31824 ; GFX10-NEXT: v_min_u32_e32 v8, v10, v8
31825 ; GFX10-NEXT: v_min_u32_e32 v6, v6, v7
31826 ; GFX10-NEXT: v_min_u32_e32 v7, v11, v9
31827 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31828 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31829 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31830 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31831 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31832 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
31833 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
31834 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31835 ; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
31836 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31837 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
31838 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31839 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31840 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31841 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
31842 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
31843 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
31844 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
31845 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
31846 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
31847 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31848 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
31849 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
31850 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31851 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
31852 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
31853 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31854 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31855 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31856 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31857 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31858 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31859 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31860 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31861 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
31862 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31864 ; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31865 ; GFX11TRUE16: ; %bb.0:
31866 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31867 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
31868 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
31869 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v9, v2, v3
31870 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v1
31871 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v6, v5
31872 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31873 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31874 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v11, v3
31875 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31876 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
31877 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
31878 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
31879 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
31880 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
31881 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
31882 ; GFX11TRUE16-NEXT: v_min_u32_e32 v8, v10, v8
31883 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31884 ; GFX11TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
31885 ; GFX11TRUE16-NEXT: v_min_u32_e32 v7, v11, v9
31886 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31887 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31888 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31889 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31890 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31891 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31892 ; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
31893 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31894 ; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
31895 ; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
31896 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31897 ; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
31898 ; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
31899 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31900 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31901 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
31902 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31903 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31904 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31905 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31906 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31907 ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
31908 ; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v6
31909 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31910 ; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
31911 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31912 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31913 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31914 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
31915 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
31916 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
31917 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31918 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
31919 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
31920 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31921 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31922 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31923 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31924 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31925 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31926 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31927 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31928 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31929 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
31930 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
31931 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
31933 ; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31934 ; GFX11FAKE16: ; %bb.0:
31935 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31936 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v0, v1
31937 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v7, v4, v5
31938 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v9, v2, v3
31939 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v1
31940 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v5
31941 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31942 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31943 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v11, v3
31944 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31945 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
31946 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
31947 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
31948 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
31949 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
31950 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
31951 ; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v10, v8
31952 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31953 ; GFX11FAKE16-NEXT: v_min_u32_e32 v6, v6, v7
31954 ; GFX11FAKE16-NEXT: v_min_u32_e32 v7, v11, v9
31955 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31956 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31957 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31958 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31959 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31960 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31961 ; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
31962 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31963 ; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
31964 ; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
31965 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31966 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
31967 ; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
31968 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31969 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31970 ; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
31971 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31972 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31973 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31974 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31975 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31976 ; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
31977 ; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
31978 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31979 ; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
31980 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31981 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31982 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31983 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
31984 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
31985 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
31986 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31987 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
31988 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
31989 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31990 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31991 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31992 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31993 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31994 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31995 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31996 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31997 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31998 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
31999 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
32000 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
32001 %op = sitofp <3 x i64> %x to <3 x bfloat>
32002 ret <3 x bfloat> %op
32005 define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
32006 ; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
32008 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32009 ; GCN-NEXT: v_ffbh_i32_e32 v8, v7
32010 ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
32011 ; GCN-NEXT: v_ffbh_i32_e32 v10, v5
32012 ; GCN-NEXT: v_xor_b32_e32 v11, v4, v5
32013 ; GCN-NEXT: v_ffbh_i32_e32 v12, v3
32014 ; GCN-NEXT: v_xor_b32_e32 v13, v2, v3
32015 ; GCN-NEXT: v_ffbh_i32_e32 v14, v1
32016 ; GCN-NEXT: v_xor_b32_e32 v15, v0, v1
32017 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32018 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32019 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
32020 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32021 ; GCN-NEXT: v_add_i32_e32 v12, vcc, -1, v12
32022 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v13
32023 ; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v14
32024 ; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v15
32025 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32026 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
32027 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v13
32028 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 32, v15
32029 ; GCN-NEXT: v_min_u32_e32 v8, v8, v9
32030 ; GCN-NEXT: v_min_u32_e32 v9, v10, v11
32031 ; GCN-NEXT: v_min_u32_e32 v10, v12, v13
32032 ; GCN-NEXT: v_min_u32_e32 v11, v14, v15
32033 ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
32034 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
32035 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
32036 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
32037 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
32038 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
32039 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
32040 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
32041 ; GCN-NEXT: v_min_u32_e32 v6, 1, v6
32042 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
32043 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
32044 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
32045 ; GCN-NEXT: v_or_b32_e32 v6, v7, v6
32046 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
32047 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
32048 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
32049 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6
32050 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4
32051 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
32052 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
32053 ; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
32054 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
32055 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
32056 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
32057 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
32058 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
32059 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
32060 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
32061 ; GCN-NEXT: s_setpc_b64 s[30:31]
32063 ; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
32065 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32066 ; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7
32067 ; GFX7-NEXT: v_ffbh_i32_e32 v8, v7
32068 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32069 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32070 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32071 ; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
32072 ; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
32073 ; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5
32074 ; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
32075 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
32076 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
32077 ; GFX7-NEXT: v_ffbh_i32_e32 v8, v5
32078 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32079 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32080 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32081 ; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
32082 ; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6
32083 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
32084 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
32085 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
32086 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
32087 ; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3
32088 ; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
32089 ; GFX7-NEXT: v_ffbh_i32_e32 v7, v3
32090 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32091 ; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
32092 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7
32093 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8
32094 ; GFX7-NEXT: v_min_u32_e32 v7, v7, v8
32095 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
32096 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
32097 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
32098 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
32099 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
32100 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
32101 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32102 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
32103 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
32104 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
32105 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
32106 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
32107 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
32108 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
32109 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
32110 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
32111 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
32112 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
32113 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
32114 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
32115 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
32116 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
32117 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
32118 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32120 ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
32122 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32123 ; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5
32124 ; GFX8-NEXT: v_ffbh_i32_e32 v8, v5
32125 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32126 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
32127 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
32128 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
32129 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32130 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32131 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
32132 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
32133 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
32134 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
32135 ; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
32136 ; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
32137 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
32138 ; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7
32139 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
32140 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v7
32141 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32142 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
32143 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
32144 ; GFX8-NEXT: v_min_u32_e32 v10, v4, v5
32145 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
32146 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
32147 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
32148 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
32149 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
32150 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
32151 ; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1
32152 ; GFX8-NEXT: v_ffbh_i32_e32 v8, v1
32153 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32154 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
32155 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
32156 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
32157 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
32158 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
32159 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
32160 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
32161 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
32162 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
32163 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
32164 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
32165 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
32166 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
32167 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
32168 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32169 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
32170 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
32171 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
32172 ; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
32173 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
32174 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
32175 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
32176 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
32177 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
32178 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
32179 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
32180 ; GFX8-NEXT: v_min_u32_e32 v8, v0, v1
32181 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
32182 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
32183 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
32184 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
32185 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
32186 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
32187 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
32188 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
32189 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
32190 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
32191 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
32192 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32193 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
32194 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32195 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32196 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
32197 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32198 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
32199 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
32200 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32202 ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
32204 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32205 ; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5
32206 ; GFX9-NEXT: v_ffbh_i32_e32 v8, v5
32207 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32208 ; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
32209 ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
32210 ; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
32211 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32212 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32213 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
32214 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
32215 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
32216 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
32217 ; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
32218 ; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
32219 ; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7
32220 ; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
32221 ; GFX9-NEXT: v_ffbh_i32_e32 v4, v7
32222 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32223 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
32224 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
32225 ; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
32226 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
32227 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
32228 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
32229 ; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
32230 ; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
32231 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32232 ; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
32233 ; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
32234 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
32235 ; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
32236 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
32237 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
32238 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
32239 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
32240 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
32241 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
32242 ; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
32243 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
32244 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
32245 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
32246 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
32247 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
32248 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32249 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
32250 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
32251 ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
32252 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
32253 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
32254 ; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
32255 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
32256 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
32257 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
32258 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
32259 ; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
32260 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
32261 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
32262 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
32263 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
32264 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
32265 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
32266 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
32267 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
32268 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
32269 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
32270 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
32271 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
32272 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32273 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32274 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32275 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
32276 ; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
32277 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32279 ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
32281 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32282 ; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5
32283 ; GFX10-NEXT: v_ffbh_i32_e32 v9, v5
32284 ; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7
32285 ; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1
32286 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v7
32287 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32288 ; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9
32289 ; GFX10-NEXT: v_ffbh_i32_e32 v12, v1
32290 ; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3
32291 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32292 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
32293 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
32294 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12
32295 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14
32296 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11
32297 ; GFX10-NEXT: v_min_u32_e32 v8, v9, v8
32298 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13
32299 ; GFX10-NEXT: v_ffbh_i32_e32 v13, v3
32300 ; GFX10-NEXT: v_add_nc_u32_e32 v14, 32, v14
32301 ; GFX10-NEXT: v_min_u32_e32 v10, v10, v11
32302 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32303 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
32304 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13
32305 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
32306 ; GFX10-NEXT: v_min_u32_e32 v9, v12, v9
32307 ; GFX10-NEXT: v_min_u32_e32 v11, v13, v14
32308 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
32309 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
32310 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
32311 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
32312 ; GFX10-NEXT: v_min_u32_e32 v5, 1, v6
32313 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8
32314 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
32315 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
32316 ; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4
32317 ; GFX10-NEXT: v_or_b32_e32 v5, v7, v5
32318 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
32319 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
32320 ; GFX10-NEXT: v_ldexp_f32 v2, v4, v6
32321 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v5
32322 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v10
32323 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
32324 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9
32325 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
32326 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11
32327 ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
32328 ; GFX10-NEXT: v_ldexp_f32 v3, v3, v4
32329 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
32330 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
32331 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
32332 ; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
32333 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
32334 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
32335 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32336 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
32337 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
32338 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32339 ; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
32340 ; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
32341 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
32342 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32343 ; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
32344 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
32345 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32346 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32347 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32348 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32349 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
32350 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32351 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
32352 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32354 ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
32356 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32357 ; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5
32358 ; GFX11-NEXT: v_cls_i32_e32 v9, v5
32359 ; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7
32360 ; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1
32361 ; GFX11-NEXT: v_cls_i32_e32 v10, v7
32362 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32363 ; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9
32364 ; GFX11-NEXT: v_cls_i32_e32 v12, v1
32365 ; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3
32366 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32367 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8
32368 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
32369 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
32370 ; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14
32371 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
32372 ; GFX11-NEXT: v_min_u32_e32 v8, v9, v8
32373 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13
32374 ; GFX11-NEXT: v_cls_i32_e32 v13, v3
32375 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 32, v14
32376 ; GFX11-NEXT: v_min_u32_e32 v10, v10, v11
32377 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32378 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
32379 ; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13
32380 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
32381 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
32382 ; GFX11-NEXT: v_min_u32_e32 v9, v12, v9
32383 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32384 ; GFX11-NEXT: v_min_u32_e32 v11, v13, v14
32385 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
32386 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
32387 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
32388 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
32389 ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
32390 ; GFX11-NEXT: v_min_u32_e32 v5, 1, v6
32391 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8
32392 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
32393 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
32394 ; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4
32395 ; GFX11-NEXT: v_or_b32_e32 v5, v7, v5
32396 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32397 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
32398 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
32399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32400 ; GFX11-NEXT: v_ldexp_f32 v2, v4, v6
32401 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v5
32402 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v10
32403 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
32404 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9
32405 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
32406 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11
32407 ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
32408 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v4
32409 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
32410 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
32411 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v6
32412 ; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
32413 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
32414 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
32415 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32416 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
32417 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
32418 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32419 ; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
32420 ; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
32421 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
32422 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32423 ; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
32424 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
32425 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32426 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32427 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32428 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32429 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32430 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32431 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
32432 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32433 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
32434 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32435 %op = sitofp <4 x i64> %x to <4 x bfloat>
32436 ret <4 x bfloat> %op
; Codegen test for uitofp of a scalar i16 to bfloat.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expectations below show, per check prefix -
;   for GCN and GFX7, the input is masked with 0xffff, converted with
;   v_cvt_f32_u32, and the f32 result is masked with 0x7fff0000;
;   for GFX8, GFX9 and GFX10, the conversion is v_cvt_f32_u32_sdwa reading
;   WORD_0, followed by v_bfe_u32 of bit 16, an add of 0x7fff, a v_cmp_u
;   self-compare that selects the 0x400000-or'ed value, and a 16-bit right shift;
;   for GFX11, the same post-conversion sequence is expected, but the input is
;   masked with v_and_b32 and converted with the non-SDWA v_cvt_f32_u32.
; NOTE(review): the bfe/add/cmp_u/cndmask sequence looks like round-to-nearest-
; even with NaN quieting - confirm against the AMDGPU bf16 lowering.
32439 define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
32440 ; GCN-LABEL: v_uitofp_i16_to_bf16:
32442 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32443 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32444 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32445 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32446 ; GCN-NEXT: s_setpc_b64 s[30:31]
32448 ; GFX7-LABEL: v_uitofp_i16_to_bf16:
32450 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32451 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32452 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32453 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32454 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32456 ; GFX8-LABEL: v_uitofp_i16_to_bf16:
32458 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32459 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32460 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
32461 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
32462 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
32463 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
32464 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32465 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
32466 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32467 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32469 ; GFX9-LABEL: v_uitofp_i16_to_bf16:
32471 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32472 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32473 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32474 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
32475 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
32476 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
32477 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32478 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
32479 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32480 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32482 ; GFX10-LABEL: v_uitofp_i16_to_bf16:
32484 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32485 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32486 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
32487 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
32488 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32489 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
32490 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32491 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32492 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32494 ; GFX11-LABEL: v_uitofp_i16_to_bf16:
32496 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32497 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
32498 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32499 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32500 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
32501 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
32502 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32503 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
32504 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
32505 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32506 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
32507 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32508 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32509 %op = uitofp i16 %x to bfloat
; Codegen test for uitofp of <2 x i16> to <2 x bfloat>.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expectations below show, per check prefix -
;   for GCN and GFX7, the two elements arrive in v0 and v1, each is masked
;   with 0xffff, converted with v_cvt_f32_u32 and masked with 0x7fff0000;
;   for GFX8 and later, both elements are read from the low and high 16 bits of
;   v0 (SDWA WORD_0/WORD_1 on GFX8, GFX9 and GFX10; v_and_b32 plus
;   v_lshrrev_b32 on GFX11), each converted value goes through the
;   bfe/add 0x7fff/cmp_u/cndmask sequence, and the two results are combined into
;   v0 with v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9
;   and later).
32514 define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
32515 ; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
32517 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32518 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32519 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32520 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32521 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32522 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32523 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32524 ; GCN-NEXT: s_setpc_b64 s[30:31]
32526 ; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
32528 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32529 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32530 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32531 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32532 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32533 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32534 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32535 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32537 ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
32539 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32540 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32541 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32542 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
32543 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
32544 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32545 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
32546 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32547 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32548 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
32549 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
32550 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32551 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
32552 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32553 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32554 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32555 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
32556 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32558 ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
32560 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32561 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32562 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32563 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32564 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
32565 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
32566 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
32567 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32568 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32569 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
32570 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
32571 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
32572 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32573 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32574 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32575 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
32576 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32578 ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
32580 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32581 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32582 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32583 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
32584 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
32585 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
32586 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32587 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
32588 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
32589 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
32590 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32591 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32592 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32593 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
32594 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32596 ; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
32598 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32599 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
32600 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32601 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32602 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
32603 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32604 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32605 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
32606 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
32607 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
32608 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32609 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
32610 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
32611 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
32612 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32613 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32614 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32615 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32616 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
32617 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
32618 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32619 %op = uitofp <2 x i16> %x to <2 x bfloat>
32620 ret <2 x bfloat> %op
; Codegen test for uitofp of <3 x i16> to <3 x bfloat>.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expectations below show, per check prefix -
;   for GCN and GFX7, the three elements arrive in v0, v1 and v2, each is
;   masked with 0xffff, converted with v_cvt_f32_u32 and masked with 0x7fff0000;
;   for GFX8 and later, elements 0 and 1 are read from the two halves of v0 and
;   element 2 from the low half of v1; each converted value goes through the
;   bfe/add 0x7fff/cmp_u/cndmask sequence, elements 0 and 1 are combined into
;   v0, and the third result is shifted down into v1.
; This is the one function in this group where the gfx1100 runs do not share the
; common GFX11 prefix - in the visible checks the real-true16 and fake16
; expectations differ only in the first source operand of the final
; v_alignbit_b32 (v0 versus s0).
32623 define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
32624 ; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
32626 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32627 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32628 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32629 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
32630 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
32631 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32632 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32633 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32634 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32635 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32636 ; GCN-NEXT: s_setpc_b64 s[30:31]
32638 ; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
32640 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32641 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32642 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32643 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
32644 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32645 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32646 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
32647 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32648 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32649 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32650 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32652 ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
32654 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32655 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32656 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32657 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32658 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32659 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
32660 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
32661 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32662 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
32663 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32664 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32665 ; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1
32666 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
32667 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
32668 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4
32669 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32670 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
32671 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
32672 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
32673 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
32674 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
32675 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32676 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
32677 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32678 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32679 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
32680 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32682 ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
32684 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32685 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32686 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32687 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32688 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32689 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
32690 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
32691 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
32692 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32693 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
32694 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
32695 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
32696 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
32697 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32698 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32699 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
32700 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
32701 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
32702 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32703 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
32704 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32705 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
32706 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
32707 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32709 ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
32711 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32712 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32713 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32714 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32715 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
32716 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
32717 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
32718 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32719 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
32720 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32721 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
32722 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32723 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
32724 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32725 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32726 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32727 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32728 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32729 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32730 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32731 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
32732 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32734 ; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32735 ; GFX11TRUE16: ; %bb.0:
32736 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32737 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
32738 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32739 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
32740 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
32741 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
32742 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32743 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32744 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
32745 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32746 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
32747 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32748 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
32749 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
32750 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
32751 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
32752 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
32753 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32754 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
32755 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32756 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32757 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32758 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32759 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32760 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32761 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32762 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32763 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32764 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32765 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
32766 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
32768 ; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32769 ; GFX11FAKE16: ; %bb.0:
32770 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32771 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
32772 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32773 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
32774 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
32775 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
32776 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32777 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32778 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
32779 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32780 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
32781 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32782 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
32783 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
32784 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
32785 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
32786 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
32787 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32788 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
32789 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32790 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32791 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32792 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32793 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32794 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32795 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32796 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32797 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32798 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32799 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
32800 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
32801 %op = uitofp <3 x i16> %x to <3 x bfloat>
32802 ret <3 x bfloat> %op
; Codegen test for uitofp of <4 x i16> to <4 x bfloat>.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expectations below show, per check prefix -
;   for GCN and GFX7, the four elements arrive in v0 through v3, each is masked
;   with 0xffff, converted with v_cvt_f32_u32 and masked with 0x7fff0000;
;   for GFX8 and later, the elements are read from the low and high halves of
;   v0 and v1, each converted value goes through the bfe/add 0x7fff/cmp_u/
;   cndmask sequence, and the results are combined pairwise into v0 and v1 with
;   v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9 and later).
32805 define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
32806 ; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
32808 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32809 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32810 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32811 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
32812 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
32813 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
32814 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
32815 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32816 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32817 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32818 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32819 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32820 ; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
32821 ; GCN-NEXT: s_setpc_b64 s[30:31]
32823 ; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
32825 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32826 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32827 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32828 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
32829 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
32830 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32831 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32832 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
32833 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
32834 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32835 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32836 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32837 ; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
32838 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32840 ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
32842 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32843 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32844 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32845 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32846 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32847 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
32848 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
32849 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
32850 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
32851 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32852 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32853 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
32854 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
32855 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
32856 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
32857 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32858 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
32859 ; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
32860 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32861 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
32862 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
32863 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5
32864 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
32865 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
32866 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
32867 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
32868 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
32869 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
32870 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32871 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
32872 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32873 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32874 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
32875 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
32876 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32878 ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
32880 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32881 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32882 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32883 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32884 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
32885 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
32886 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
32887 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32888 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32889 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32890 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32891 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
32892 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
32893 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
32894 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32895 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
32896 ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
32897 ; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
32898 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
32899 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32900 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
32901 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
32902 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
32903 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
32904 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32905 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
32906 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32907 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
32908 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
32909 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32911 ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
32913 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32914 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32915 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32916 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32917 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32918 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
32919 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
32920 ; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
32921 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32922 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
32923 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
32924 ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
32925 ; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
32926 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
32927 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
32928 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32929 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32930 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
32931 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
32932 ; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
32933 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32934 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32935 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32936 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32937 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
32938 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32939 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
32940 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32942 ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
32944 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32945 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1
32946 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32947 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32948 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
32949 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
32950 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
32951 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
32952 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
32953 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32954 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
32955 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
32956 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
32957 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0
32958 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32959 ; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
32960 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32961 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32962 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
32963 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
32964 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32965 ; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
32966 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
32967 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
32968 ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
32969 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32970 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
32971 ; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
32972 ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
32973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32974 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32975 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32976 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32977 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32978 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32979 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
32980 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32981 ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
32982 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32983 %op = uitofp <4 x i16> %x to <4 x bfloat>
32984 ret <4 x bfloat> %op
; Codegen test for uitofp of a scalar i32 to bfloat.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expectations below show, per check prefix -
;   for GCN and GFX7, a single v_cvt_f32_u32 followed by a mask of the f32
;   result with 0x7fff0000;
;   for GFX8 and later, v_cvt_f32_u32 followed by v_bfe_u32 of bit 16, an add of
;   0x7fff, a v_cmp_u self-compare that selects the 0x400000-or'ed value, and a
;   16-bit right shift. No input masking is expected here, unlike the i16
;   variants of this test.
32987 define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
32988 ; GCN-LABEL: v_uitofp_i32_to_bf16:
32990 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32991 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32992 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32993 ; GCN-NEXT: s_setpc_b64 s[30:31]
32995 ; GFX7-LABEL: v_uitofp_i32_to_bf16:
32997 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32998 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32999 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33000 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33002 ; GFX8-LABEL: v_uitofp_i32_to_bf16:
33004 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33005 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33006 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
33007 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
33008 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
33009 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
33010 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33011 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
33012 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33013 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33015 ; GFX9-LABEL: v_uitofp_i32_to_bf16:
33017 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33018 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33019 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33020 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
33021 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
33022 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
33023 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33024 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33025 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33026 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33028 ; GFX10-LABEL: v_uitofp_i32_to_bf16:
33030 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33031 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33032 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
33033 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
33034 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33035 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33036 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33037 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33038 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33040 ; GFX11-LABEL: v_uitofp_i32_to_bf16:
33042 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33043 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33044 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33045 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
33046 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
33047 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33048 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33049 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33050 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33051 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33052 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33053 %op = uitofp i32 %x to bfloat
; Codegen test: uitofp from <2 x i32> to <2 x bfloat>.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Shape of the recorded output per check prefix:
;  - GCN and GFX7 prefixes: each lane is converted with v_cvt_f32_u32 and then
;    masked with 0x7fff0000; the two results stay in separate VGPRs (v0, v1).
;  - GFX8 and later: each lane goes through a bfe / add 0x7fff / or 0x400000 /
;    cmp_u / cndmask sequence (presumably round-to-nearest-even plus NaN
;    quieting -- confirm against the bf16 lowering), and both halves are packed
;    into v0 with v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302
;    (GFX9, GFX10, GFX11).
33056 define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
33057 ; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
33059 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33060 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33061 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33062 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33063 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33064 ; GCN-NEXT: s_setpc_b64 s[30:31]
33066 ; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
33068 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33069 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33070 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33071 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33072 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33073 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33075 ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
33077 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33078 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33079 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33080 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
33081 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
33082 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33083 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
33084 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33085 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
33086 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
33087 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
33088 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33089 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
33090 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33091 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
33092 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33093 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33094 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33096 ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
33098 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33099 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33100 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33101 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33102 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33103 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33104 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33105 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33106 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33107 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
33108 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
33109 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
33110 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33111 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
33112 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33113 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33114 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33116 ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
33118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33119 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33120 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33121 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
33122 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
33123 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
33124 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33125 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
33126 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33127 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33128 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33129 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33130 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33131 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33132 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33134 ; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
33136 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33137 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33138 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33139 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33140 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
33141 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
33142 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
33143 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33144 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
33145 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33146 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33147 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33148 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33149 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33150 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
33152 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33153 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33154 %op = uitofp <2 x i32> %x to <2 x bfloat>
33155 ret <2 x bfloat> %op
; Codegen test: uitofp from <3 x i32> to <3 x bfloat> (odd element count).
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Shape of the recorded output per check prefix:
;  - GCN and GFX7 prefixes: three v_cvt_f32_u32 followed by three masks with
;    0x7fff0000, results left in v0, v1 and v2.
;  - GFX8: lanes 0 and 1 are packed into v0 with v_alignbit_b32; lane 2 is
;    shifted right by 16 and moved into v1.
;  - GFX9 and GFX10: lanes 0 and 1 are packed into v0 with v_perm_b32; lane 2
;    is placed in v1 by v_alignbit_b32 whose first source is s4.
;  - GFX11 is split into the GFX11TRUE16 and GFX11FAKE16 prefixes. The two
;    recorded sequences differ only in the first source operand of the final
;    v_alignbit_b32 (v0 for the true16 run, s0 for the fake16 run).
; NOTE(review): the first source of that final v_alignbit_b32 presumably feeds
; only the unused upper half of v1 (padding for the 3-element result), which
; would explain why an arbitrary register appears there -- confirm before
; relying on it.
33158 define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
33159 ; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
33161 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33162 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33163 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33164 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33165 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33166 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33167 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33168 ; GCN-NEXT: s_setpc_b64 s[30:31]
33170 ; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
33172 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33173 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33174 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33175 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33176 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33177 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33178 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33179 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33181 ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
33183 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33184 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33185 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33186 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33187 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
33188 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
33189 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33190 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
33191 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33192 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
33193 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
33194 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
33195 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33196 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
33197 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33198 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
33199 ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
33200 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
33201 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33202 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
33203 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33204 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
33205 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
33206 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33207 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33208 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
33209 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33211 ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
33213 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33214 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
33215 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33216 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33217 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33218 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
33219 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
33220 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
33221 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33222 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
33223 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
33224 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
33225 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
33226 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33227 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
33228 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
33229 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
33230 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
33231 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33232 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
33233 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33234 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33235 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
33236 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33238 ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
33240 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33241 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33242 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33243 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33244 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
33245 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
33246 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
33247 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33248 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
33249 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33250 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
33251 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33252 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
33253 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33254 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33255 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33256 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33257 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33258 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33259 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33260 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
33261 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33263 ; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33264 ; GFX11TRUE16: ; %bb.0:
33265 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33266 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
33267 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
33268 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
33269 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33270 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
33271 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
33272 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
33273 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33274 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
33275 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33276 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
33277 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33278 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
33279 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33280 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33281 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33282 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33283 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33284 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33285 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33286 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33287 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
33288 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
33290 ; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33291 ; GFX11FAKE16: ; %bb.0:
33292 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33293 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
33294 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
33295 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
33296 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33297 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
33298 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
33299 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
33300 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33301 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
33302 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33303 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
33304 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33305 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
33306 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33307 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33308 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33309 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33310 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33311 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33312 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33313 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33314 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
33315 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
33316 %op = uitofp <3 x i32> %x to <3 x bfloat>
33317 ret <3 x bfloat> %op
; Codegen test: uitofp from <4 x i32> to <4 x bfloat>.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Shape of the recorded output per check prefix:
;  - GCN and GFX7 prefixes: four v_cvt_f32_u32 followed by four masks with
;    0x7fff0000, results left in v0..v3.
;  - GFX8: each lane goes through the bfe / add / add 0x7fff / or 0x400000 /
;    cmp_u / cndmask sequence; lanes 0,1 are packed into v0 and lanes 2,3 into
;    v1 with v_alignbit_b32. The recorded output loads 0x7fff into s4 but only
;    the add for the v3 lane reads s4; the other three adds use the literal.
;    That mix is what the generator captured, so do not normalise it by hand.
;  - GFX9, GFX10 and GFX11: same per-lane sequence using v_add3_u32, with the
;    pairs packed by two v_perm_b32 instructions (selector 0x7060302).
33320 define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
33321 ; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
33323 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33324 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
33325 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33326 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33327 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33328 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33329 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33330 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33331 ; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
33332 ; GCN-NEXT: s_setpc_b64 s[30:31]
33334 ; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
33336 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33337 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33338 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33339 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33340 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
33341 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33342 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33343 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33344 ; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
33345 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33347 ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
33349 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33350 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33351 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
33352 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33353 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
33354 ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
33355 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
33356 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33357 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
33358 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33359 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
33360 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
33361 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
33362 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
33363 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
33364 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
33365 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
33366 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
33367 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33368 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
33369 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33370 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
33371 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33372 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
33373 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
33374 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
33375 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33376 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
33377 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33378 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
33379 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
33380 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33381 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33382 ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
33383 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33385 ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
33387 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33388 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
33389 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
33390 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33391 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33392 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
33393 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
33394 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
33395 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33396 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
33397 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
33398 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33399 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
33400 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
33401 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
33402 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
33403 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
33404 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
33405 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
33406 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33407 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
33408 ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
33409 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
33410 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
33411 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33412 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
33413 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33414 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33415 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
33416 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33418 ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
33420 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33421 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33422 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33423 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33424 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
33425 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
33426 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
33427 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
33428 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33429 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
33430 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33431 ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
33432 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
33433 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
33434 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
33435 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33436 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33437 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
33438 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
33439 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
33440 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33441 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33442 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33443 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
33444 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33445 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33446 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
33447 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33449 ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
33451 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33452 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
33453 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33454 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33455 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
33456 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
33457 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
33458 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
33459 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
33460 ; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
33461 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33462 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33463 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
33464 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
33465 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
33466 ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
33467 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33468 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33469 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
33470 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
33471 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33472 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33473 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
33474 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
33475 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33476 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
33477 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33478 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
33479 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33480 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
33481 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33482 %op = uitofp <4 x i32> %x to <4 x bfloat>
33483 ret <4 x bfloat> %op
; Codegen test: uitofp from a scalar i64 to bfloat.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Shape of the recorded output, common to every check prefix:
;  1. count leading zeros of the high word v1 (v_ffbh_u32, or v_clz_i32_u32
;     under the GFX11 prefix) and clamp the count to 32 with v_min_u32;
;  2. shift the 64-bit value v[0:1] left by that count;
;  3. reduce the low word to 0/1 with v_min_u32 and OR it into the high word
;     (presumably a sticky bit for the discarded low bits -- confirm);
;  4. convert the 32-bit result with v_cvt_f32_u32 and rescale it with
;     v_ldexp_f32 by (32 - shift count).
; The final f32 -> bf16 step then differs by target:
;  - GCN and GFX7 prefixes mask the result with 0xffff0000;
;  - GFX8 and later run the bfe / add 0x7fff / or 0x400000 / cmp_u / cndmask
;    sequence and shift the result right by 16.
33486 define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
33487 ; GCN-LABEL: v_uitofp_i64_to_bf16:
33489 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33490 ; GCN-NEXT: v_ffbh_u32_e32 v2, v1
33491 ; GCN-NEXT: v_min_u32_e32 v2, 32, v2
33492 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
33493 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33494 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33495 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33496 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
33497 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
33498 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33499 ; GCN-NEXT: s_setpc_b64 s[30:31]
33501 ; GFX7-LABEL: v_uitofp_i64_to_bf16:
33503 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33504 ; GFX7-NEXT: v_ffbh_u32_e32 v2, v1
33505 ; GFX7-NEXT: v_min_u32_e32 v2, 32, v2
33506 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
33507 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33508 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33509 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33510 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
33511 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
33512 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33513 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33515 ; GFX8-LABEL: v_uitofp_i64_to_bf16:
33517 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33518 ; GFX8-NEXT: v_ffbh_u32_e32 v2, v1
33519 ; GFX8-NEXT: v_min_u32_e32 v2, 32, v2
33520 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33521 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33522 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33523 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33524 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
33525 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
33526 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
33527 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
33528 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
33529 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
33530 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33531 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33532 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33533 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33535 ; GFX9-LABEL: v_uitofp_i64_to_bf16:
33537 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33538 ; GFX9-NEXT: v_ffbh_u32_e32 v2, v1
33539 ; GFX9-NEXT: v_min_u32_e32 v2, 32, v2
33540 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33541 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33542 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33543 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33544 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33545 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
33546 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
33547 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
33548 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
33549 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
33550 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33551 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33552 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33553 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33555 ; GFX10-LABEL: v_uitofp_i64_to_bf16:
33557 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33558 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
33559 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
33560 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33561 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33562 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33563 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
33564 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33565 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
33566 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
33567 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
33568 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33569 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33570 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33571 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33572 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33574 ; GFX11-LABEL: v_uitofp_i64_to_bf16:
33576 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33577 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
33578 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33579 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
33580 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33581 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33582 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
33583 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
33584 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
33585 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
33586 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33587 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
33588 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33589 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
33590 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
33591 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33592 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33593 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33594 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33595 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33596 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33597 %op = uitofp i64 %x to bfloat
; Codegen test: uitofp from <2 x i64> to <2 x bfloat>.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Shape of the recorded output: each 64-bit lane (v[0:1] and v[2:3]) goes
; through the same steps -- leading-zero count of the high word clamped to 32,
; 64-bit left shift, low word reduced to 0/1 and ORed into the high word,
; v_cvt_f32_u32, then v_ldexp_f32 by (32 - shift count).
; The final f32 -> bf16 step differs by target:
;  - GCN and GFX7 prefixes mask each lane with 0xffff0000 and leave the
;    results in v0 and v1;
;  - GFX8 runs the bfe / add / add 0x7fff / or 0x400000 / cmp_u / cndmask
;    sequence per lane and packs both halves into v0 with v_alignbit_b32;
;  - GFX9, GFX10 and GFX11 use v_add3_u32 in that sequence and pack with
;    v_perm_b32 (selector 0x7060302).
33601 define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
33602 ; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
33604 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33605 ; GCN-NEXT: v_ffbh_u32_e32 v4, v3
33606 ; GCN-NEXT: v_ffbh_u32_e32 v5, v1
33607 ; GCN-NEXT: v_min_u32_e32 v4, 32, v4
33608 ; GCN-NEXT: v_min_u32_e32 v5, 32, v5
33609 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
33610 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
33611 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
33612 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
33613 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
33614 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33615 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
33616 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33617 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2
33618 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33619 ; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
33620 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
33621 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33622 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33623 ; GCN-NEXT: s_setpc_b64 s[30:31]
33625 ; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
33627 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33628 ; GFX7-NEXT: v_ffbh_u32_e32 v4, v3
33629 ; GFX7-NEXT: v_min_u32_e32 v4, 32, v4
33630 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
33631 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
33632 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
33633 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
33634 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
33635 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
33636 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
33637 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33638 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33639 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33640 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33641 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
33642 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
33643 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
33644 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33645 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33646 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33648 ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
33650 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33651 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v1
33652 ; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
33653 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33654 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33655 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33656 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33657 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
33658 ; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
33659 ; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
33660 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
33661 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0
33662 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
33663 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
33664 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
33665 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
33666 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33667 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33668 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33669 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33670 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
33671 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
33672 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
33673 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
33674 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
33675 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
33676 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
33677 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33678 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33679 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33680 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
33681 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33683 ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
33685 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33686 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v1
33687 ; GFX9-NEXT: v_min_u32_e32 v4, 32, v4
33688 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33689 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33690 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33691 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33692 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33693 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
33694 ; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
33695 ; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
33696 ; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
33697 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
33698 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
33699 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
33700 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
33701 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33702 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33703 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33704 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33705 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
33706 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
33707 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
33708 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33709 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33710 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33711 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33712 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33713 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33714 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
33715 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33717 ; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
33719 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33720 ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
33721 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
33722 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
33723 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
33724 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33725 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33726 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33727 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
33728 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33729 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
33730 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
33731 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
33732 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33733 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33734 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
33735 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
33736 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
33737 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
33738 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
33739 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33740 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
33741 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33742 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33743 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33744 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33745 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33746 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33747 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33749 ; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
33751 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33752 ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1
33753 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3
33754 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33755 ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4
33756 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5
33757 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33758 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33759 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33760 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33761 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
33762 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
33763 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33764 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
33765 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
33766 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4
33767 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5
33768 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
33769 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33770 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33771 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33772 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
33773 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
33774 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33775 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
33776 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
33777 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
33778 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33779 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
33780 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33781 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33782 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33783 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33784 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33785 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33786 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
33787 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33788 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33789 %op = uitofp <2 x i64> %x to <2 x bfloat>
33790 ret <2 x bfloat> %op
; Codegen test: unsigned <3 x i64> -> <3 x bfloat> conversion (uitofp).
; One group of CHECK lines follows per FileCheck prefix used by the RUN lines
; (GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16, GFX11FAKE16). The assertions are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
; NOTE(review): the GFX11TRUE16 and GFX11FAKE16 groups below differ only in
; their final s_delay_alu / v_alignbit_b32 pair (v0 vs s0 as the first source
; operand) - confirm against freshly regenerated output.
33793 define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
33794 ; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
33796 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33797 ; GCN-NEXT: v_ffbh_u32_e32 v6, v5
33798 ; GCN-NEXT: v_ffbh_u32_e32 v7, v3
33799 ; GCN-NEXT: v_ffbh_u32_e32 v8, v1
33800 ; GCN-NEXT: v_min_u32_e32 v6, 32, v6
33801 ; GCN-NEXT: v_min_u32_e32 v7, 32, v7
33802 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8
33803 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
33804 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
33805 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
33806 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
33807 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
33808 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
33809 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
33810 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
33811 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33812 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
33813 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
33814 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33815 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
33816 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33817 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33818 ; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
33819 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
33820 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
33821 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33822 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33823 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
33824 ; GCN-NEXT: s_setpc_b64 s[30:31]
33826 ; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
33828 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33829 ; GFX7-NEXT: v_ffbh_u32_e32 v6, v5
33830 ; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
33831 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
33832 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
33833 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
33834 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
33835 ; GFX7-NEXT: v_ffbh_u32_e32 v6, v3
33836 ; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
33837 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
33838 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
33839 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
33840 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
33841 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
33842 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
33843 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
33844 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33845 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33846 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33847 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33848 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
33849 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
33850 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
33851 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
33852 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
33853 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33854 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33855 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
33856 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33858 ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
33860 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33861 ; GFX8-NEXT: v_ffbh_u32_e32 v6, v5
33862 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
33863 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
33864 ; GFX8-NEXT: v_ffbh_u32_e32 v7, v1
33865 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
33866 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
33867 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
33868 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
33869 ; GFX8-NEXT: v_min_u32_e32 v7, 32, v7
33870 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
33871 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
33872 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
33873 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
33874 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
33875 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
33876 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33877 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
33878 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33879 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33880 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
33881 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33882 ; GFX8-NEXT: v_ffbh_u32_e32 v5, v3
33883 ; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
33884 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33885 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
33886 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
33887 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
33888 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
33889 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
33890 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
33891 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
33892 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33893 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
33894 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
33895 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33896 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
33897 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
33898 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
33899 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
33900 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
33901 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33902 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
33903 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33904 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
33905 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
33906 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
33907 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33909 ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
33911 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33912 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
33913 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
33914 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
33915 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33916 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
33917 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
33918 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
33919 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
33920 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
33921 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
33922 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
33923 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33924 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33925 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33926 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
33927 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
33928 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
33929 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
33930 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33931 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
33932 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
33933 ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
33934 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
33935 ; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
33936 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
33937 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
33938 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
33939 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
33940 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33941 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33942 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33943 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
33944 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
33945 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
33946 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
33947 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33948 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33949 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33950 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33951 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33952 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33953 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
33954 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
33955 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33957 ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
33959 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33960 ; GFX10-NEXT: v_ffbh_u32_e32 v6, v1
33961 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v3
33962 ; GFX10-NEXT: v_ffbh_u32_e32 v7, v5
33963 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6
33964 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
33965 ; GFX10-NEXT: v_min_u32_e32 v7, 32, v7
33966 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
33967 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
33968 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
33969 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v7
33970 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33971 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
33972 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
33973 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33974 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
33975 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6
33976 ; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
33977 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8
33978 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33979 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33980 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33981 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
33982 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
33983 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v7
33984 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
33985 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
33986 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
33987 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33988 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
33989 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33990 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
33991 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
33992 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
33993 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
33994 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33995 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33996 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
33997 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33998 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
33999 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34000 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
34001 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34003 ; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
34004 ; GFX11TRUE16: ; %bb.0:
34005 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34006 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v6, v1
34007 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v7, v5
34008 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v3
34009 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34010 ; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 32, v6
34011 ; GFX11TRUE16-NEXT: v_min_u32_e32 v7, 32, v7
34012 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34013 ; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8
34014 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
34015 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34016 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
34017 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
34018 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
34019 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34020 ; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
34021 ; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
34022 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34023 ; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
34024 ; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
34025 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34026 ; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
34027 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
34028 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
34029 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
34030 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
34031 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
34032 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
34033 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34034 ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
34035 ; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v7
34036 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34037 ; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
34038 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
34039 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34040 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
34041 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
34042 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
34043 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34044 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
34045 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
34046 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
34047 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
34048 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
34049 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34050 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34051 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34052 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34053 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34054 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34055 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
34056 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34057 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
34058 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34060 ; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
34061 ; GFX11FAKE16: ; %bb.0:
34062 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34063 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v6, v1
34064 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v7, v5
34065 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v3
34066 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34067 ; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 32, v6
34068 ; GFX11FAKE16-NEXT: v_min_u32_e32 v7, 32, v7
34069 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34070 ; GFX11FAKE16-NEXT: v_min_u32_e32 v8, 32, v8
34071 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
34072 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34073 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
34074 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
34075 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
34076 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34077 ; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
34078 ; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
34079 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34080 ; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
34081 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
34082 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34083 ; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
34084 ; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
34085 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
34086 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
34087 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
34088 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
34089 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
34090 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34091 ; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
34092 ; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v7
34093 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34094 ; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
34095 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
34096 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34097 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
34098 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
34099 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
34100 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34101 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
34102 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
34103 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
34104 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
34105 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
34106 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34107 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34108 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34109 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34110 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34111 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34112 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
34113 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
34114 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
34115 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single uitofp on the whole vector, returned directly.
34116 %op = uitofp <3 x i64> %x to <3 x bfloat>
34117 ret <3 x bfloat> %op
; Codegen test: unsigned <4 x i64> -> <4 x bfloat> conversion (uitofp).
; One group of CHECK lines follows per FileCheck prefix (GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11). Both gfx1100 RUN lines (+real-true16 and -real-true16) are
; checked under the shared GFX11 prefix for this function. The assertions are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
34120 define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
34121 ; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
34123 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34124 ; GCN-NEXT: v_ffbh_u32_e32 v8, v7
34125 ; GCN-NEXT: v_ffbh_u32_e32 v9, v5
34126 ; GCN-NEXT: v_ffbh_u32_e32 v10, v3
34127 ; GCN-NEXT: v_ffbh_u32_e32 v11, v1
34128 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8
34129 ; GCN-NEXT: v_min_u32_e32 v9, 32, v9
34130 ; GCN-NEXT: v_min_u32_e32 v10, 32, v10
34131 ; GCN-NEXT: v_min_u32_e32 v11, 32, v11
34132 ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
34133 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
34134 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
34135 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
34136 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
34137 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
34138 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
34139 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
34140 ; GCN-NEXT: v_min_u32_e32 v6, 1, v6
34141 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
34142 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
34143 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
34144 ; GCN-NEXT: v_or_b32_e32 v6, v7, v6
34145 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
34146 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
34147 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
34148 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6
34149 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4
34150 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
34151 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
34152 ; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
34153 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
34154 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
34155 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
34156 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34157 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34158 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
34159 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
34160 ; GCN-NEXT: s_setpc_b64 s[30:31]
34162 ; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
34164 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34165 ; GFX7-NEXT: v_ffbh_u32_e32 v8, v7
34166 ; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
34167 ; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
34168 ; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
34169 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
34170 ; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6
34171 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
34172 ; GFX7-NEXT: v_ffbh_u32_e32 v8, v5
34173 ; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
34174 ; GFX7-NEXT: v_ffbh_u32_e32 v7, v3
34175 ; GFX7-NEXT: v_min_u32_e32 v7, 32, v7
34176 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
34177 ; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
34178 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
34179 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
34180 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
34181 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
34182 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
34183 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
34184 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
34185 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
34186 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
34187 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
34188 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
34189 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
34190 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
34191 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
34192 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
34193 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
34194 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
34195 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
34196 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
34197 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34198 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34199 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
34200 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
34201 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34203 ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
34205 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34206 ; GFX8-NEXT: v_ffbh_u32_e32 v8, v5
34207 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
34208 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34209 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
34210 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
34211 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
34212 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
34213 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
34214 ; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
34215 ; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
34216 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
34217 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
34218 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
34219 ; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
34220 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
34221 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
34222 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
34223 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
34224 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
34225 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
34226 ; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
34227 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
34228 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
34229 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
34230 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
34231 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
34232 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
34233 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
34234 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
34235 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
34236 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
34237 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
34238 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
34239 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
34240 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
34241 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
34242 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
34243 ; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
34244 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
34245 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
34246 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
34247 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v0
34248 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
34249 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
34250 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
34251 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
34252 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
34253 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
34254 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
34255 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
34256 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
34257 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
34258 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
34259 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
34260 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
34261 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
34262 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
34263 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34264 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34265 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
34266 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
34267 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34269 ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
34271 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34272 ; GFX9-NEXT: v_ffbh_u32_e32 v8, v5
34273 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
34274 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34275 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
34276 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
34277 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
34278 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
34279 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
34280 ; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
34281 ; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
34282 ; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
34283 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
34284 ; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
34285 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
34286 ; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
34287 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
34288 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
34289 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
34290 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
34291 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
34292 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
34293 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
34294 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
34295 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
34296 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
34297 ; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
34298 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
34299 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
34300 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
34301 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
34302 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
34303 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
34304 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
34305 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
34306 ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
34307 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
34308 ; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
34309 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
34310 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
34311 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
34312 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
34313 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
34314 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
34315 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
34316 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
34317 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
34318 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
34319 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
34320 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
34321 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
34322 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
34323 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
34324 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
34325 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
34326 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
34327 ; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
34328 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34330 ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
34332 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34333 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v5
34334 ; GFX10-NEXT: v_ffbh_u32_e32 v10, v1
34335 ; GFX10-NEXT: v_ffbh_u32_e32 v11, v3
34336 ; GFX10-NEXT: v_ffbh_u32_e32 v9, v7
34337 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
34338 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10
34339 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11
34340 ; GFX10-NEXT: v_min_u32_e32 v9, 32, v9
34341 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34342 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
34343 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
34344 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
34345 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8
34346 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 32, v9
34347 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
34348 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
34349 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
34350 ; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
34351 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
34352 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
34353 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
34354 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10
34355 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v11
34356 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v4
34357 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
34358 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
34359 ; GFX10-NEXT: v_or_b32_e32 v6, v7, v6
34360 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v8
34361 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
34362 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
34363 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6
34364 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
34365 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
34366 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
34367 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34368 ; GFX10-NEXT: v_ldexp_f32 v4, v4, v9
34369 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
34370 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
34371 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
34372 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
34373 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
34374 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34375 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34376 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
34377 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1
34378 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
34379 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
34380 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34381 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34382 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34383 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
34384 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
34385 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34386 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
34387 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34389 ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
34391 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34392 ; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5
34393 ; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1
34394 ; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3
34395 ; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7
34396 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34397 ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8
34398 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10
34399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34400 ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
34401 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
34402 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34403 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34404 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
34405 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34406 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
34407 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
34408 ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8
34409 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
34410 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
34411 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
34412 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
34413 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
34414 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34415 ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
34416 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
34417 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
34418 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
34419 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10
34420 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11
34421 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4
34422 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
34423 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
34424 ; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
34425 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34426 ; GFX11-NEXT: v_ldexp_f32 v2, v2, v8
34427 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
34428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34429 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
34430 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6
34431 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
34432 ; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
34433 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
34434 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
34435 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
34436 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34437 ; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
34438 ; GFX11-NEXT: v_ldexp_f32 v4, v4, v9
34439 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
34440 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
34441 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
34442 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34443 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34444 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
34445 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1
34446 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
34447 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34448 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34449 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
34450 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34451 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
34452 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
34453 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
34454 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
34456 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
34457 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single uitofp on the whole vector, returned directly.
34458 %op = uitofp <4 x i64> %x to <4 x bfloat>
34459 ret <4 x bfloat> %op
; v_select_bf16 - scalar bfloat select driven by an i1 argument
; (`select i1 %cond, bfloat %a, bfloat %b`).
; Every run below expects the condition in v0 to be masked to bit 0, compared
; against 1, and then used by a single v_cndmask. The GCN and GFX7 runs
; additionally expect a v_mul_f32 by 1.0 on both inputs and a 0xffff0000 mask
; on the result; the GFX11TRUE16 run expects the 16-bit v_cndmask_b16 form.
34462 define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
34463 ; GCN-LABEL: v_select_bf16:
34465 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34466 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34467 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34468 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34469 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34470 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34471 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34472 ; GCN-NEXT: s_setpc_b64 s[30:31]
34474 ; GFX7-LABEL: v_select_bf16:
34476 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34477 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34478 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34479 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34480 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34481 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34482 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34483 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34485 ; GFX8-LABEL: v_select_bf16:
34487 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34488 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34489 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34490 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34491 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34493 ; GFX9-LABEL: v_select_bf16:
34495 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34496 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34497 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34498 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34499 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34501 ; GFX10-LABEL: v_select_bf16:
34503 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34504 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34505 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34506 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34507 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34509 ; GFX11TRUE16-LABEL: v_select_bf16:
34510 ; GFX11TRUE16: ; %bb.0:
34511 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34512 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
34513 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
34514 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
34515 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34516 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
34517 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
34518 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34520 ; GFX11FAKE16-LABEL: v_select_bf16:
34521 ; GFX11FAKE16: ; %bb.0:
34522 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34523 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34524 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34525 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34526 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34527 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34528 %op = select i1 %cond, bfloat %a, bfloat %b
; v_select_fneg_lhs_bf16 - scalar bfloat select whose true operand is negated
; (`fneg bfloat %a` feeds the first value operand of the select).
; The GCN and GFX7 runs expect the negation to appear as a v_mul_f32 by -1.0
; on v1; the remaining runs expect an xor with 0x8000 applied to the %a input
; before the v_cndmask.
34532 define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34533 ; GCN-LABEL: v_select_fneg_lhs_bf16:
34535 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34536 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34537 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34538 ; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
34539 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34540 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34541 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34542 ; GCN-NEXT: s_setpc_b64 s[30:31]
34544 ; GFX7-LABEL: v_select_fneg_lhs_bf16:
34546 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34547 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34548 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34549 ; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
34550 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34551 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34552 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34553 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34555 ; GFX8-LABEL: v_select_fneg_lhs_bf16:
34557 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34558 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34559 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34560 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34561 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34562 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34564 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
34566 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34567 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34568 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34569 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34570 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34571 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34573 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
34575 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34576 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34577 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34578 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34579 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34580 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34582 ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
34583 ; GFX11TRUE16: ; %bb.0:
34584 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34585 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
34586 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
34587 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
34588 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34589 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
34590 ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
34591 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34592 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
34593 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34595 ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
34596 ; GFX11FAKE16: ; %bb.0:
34597 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34598 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34599 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34600 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34601 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34602 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34603 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34604 %neg.a = fneg bfloat %a
34605 %op = select i1 %cond, bfloat %neg.a, bfloat %b
; v_select_fneg_rhs_bf16 - scalar bfloat select whose false operand is negated
; (`fneg bfloat %b` feeds the second value operand of the select).
; The GCN and GFX7 runs expect the negation to appear as a v_mul_f32 by -1.0
; on v2; the remaining runs expect an xor with 0x8000 applied to the %b input
; before the v_cndmask.
34609 define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34610 ; GCN-LABEL: v_select_fneg_rhs_bf16:
34612 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34613 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34614 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34615 ; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
34616 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34617 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34618 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34619 ; GCN-NEXT: s_setpc_b64 s[30:31]
34621 ; GFX7-LABEL: v_select_fneg_rhs_bf16:
34623 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34624 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34625 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34626 ; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
34627 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34628 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34629 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34630 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34632 ; GFX8-LABEL: v_select_fneg_rhs_bf16:
34634 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34635 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34636 ; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34637 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34638 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34639 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34641 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
34643 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34644 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34645 ; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34646 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34647 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34648 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34650 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
34652 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34653 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34654 ; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34655 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34656 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34657 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34659 ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
34660 ; GFX11TRUE16: ; %bb.0:
34661 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34662 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
34663 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
34664 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
34665 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34666 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
34667 ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
34668 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34669 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
34670 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34672 ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
34673 ; GFX11FAKE16: ; %bb.0:
34674 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34675 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34676 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34677 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34678 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34679 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34680 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34681 %neg.b = fneg bfloat %b
34682 %op = select i1 %cond, bfloat %a, bfloat %neg.b
; v_select_v2bf16 - select between two <2 x bfloat> values using one scalar
; i1 condition that applies to both elements.
; The GCN and GFX7 runs expect the four elements in v1-v4 and the two results
; in v0 and v1. The other runs expect packed inputs in v1 and v2: the high
; halves are extracted with a 16-bit right shift, each half goes through its
; own v_cndmask, and the halves are repacked (v_or_b32_sdwa in the GFX8 run,
; v_perm_b32 with selector 0x5040100 in the later runs).
34686 define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34687 ; GCN-LABEL: v_select_v2bf16:
34689 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34690 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34691 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
34692 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34693 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
34694 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34695 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34696 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34697 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34698 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34699 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34700 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
34701 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
34702 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
34703 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
34704 ; GCN-NEXT: s_setpc_b64 s[30:31]
34706 ; GFX7-LABEL: v_select_v2bf16:
34708 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34709 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34710 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
34711 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34712 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
34713 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34714 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34715 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34716 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34717 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34718 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34719 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
34720 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
34721 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
34722 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
34723 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34725 ; GFX8-LABEL: v_select_v2bf16:
34727 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34728 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34729 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34730 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34731 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34732 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34733 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34734 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34735 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34736 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34738 ; GFX9-LABEL: v_select_v2bf16:
34740 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34741 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34742 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34743 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34744 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34745 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34746 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34747 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
34748 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
34749 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34751 ; GFX10-LABEL: v_select_v2bf16:
34753 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34754 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34755 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
34756 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34757 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34758 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34759 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
34760 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34761 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34763 ; GFX11TRUE16-LABEL: v_select_v2bf16:
34764 ; GFX11TRUE16: ; %bb.0:
34765 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34766 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34767 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
34768 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34769 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34770 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34771 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo
34772 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
34773 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34774 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
34775 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
34776 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34777 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34778 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34780 ; GFX11FAKE16-LABEL: v_select_v2bf16:
34781 ; GFX11FAKE16: ; %bb.0:
34782 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34783 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
34784 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34785 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34786 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
34787 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34788 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
34789 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34790 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34791 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34792 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
34793 ret <2 x bfloat> %op
; v_vselect_v2bf16 - element-wise select between two <2 x bfloat> values with
; a <2 x i1> condition, i.e. one condition bit per element.
; Unlike the scalar-condition variant, every run expects two condition
; compares (one on v0, one on v1), each feeding its own v_cndmask. The runs
; other than GCN and GFX7 then repack the two selected halves into v0
; (v_or_b32_sdwa in the GFX8 run, v_perm_b32 with 0x5040100 in later runs).
34796 define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34797 ; GCN-LABEL: v_vselect_v2bf16:
34799 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34800 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34801 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
34802 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34803 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
34804 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
34805 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
34806 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34807 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
34808 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34809 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
34810 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34811 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34812 ; GCN-NEXT: s_setpc_b64 s[30:31]
34814 ; GFX7-LABEL: v_vselect_v2bf16:
34816 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34817 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
34818 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34819 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
34820 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
34821 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34822 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34823 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
34824 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
34825 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34826 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
34827 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34828 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34829 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34831 ; GFX8-LABEL: v_vselect_v2bf16:
34833 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34834 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34835 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
34836 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34837 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
34838 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34839 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34840 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34841 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
34842 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34843 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34844 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34846 ; GFX9-LABEL: v_vselect_v2bf16:
34848 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34849 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34850 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
34851 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34852 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
34853 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34854 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34855 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34856 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
34857 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
34858 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
34859 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34861 ; GFX10-LABEL: v_vselect_v2bf16:
34863 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34864 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34865 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
34866 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34867 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
34868 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34869 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
34870 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34871 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34872 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34873 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34875 ; GFX11TRUE16-LABEL: v_vselect_v2bf16:
34876 ; GFX11TRUE16: ; %bb.0:
34877 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34878 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
34879 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34880 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34881 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
34882 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34883 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34884 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
34885 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34886 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo
34887 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v2.l, s0
34888 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34889 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
34890 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
34891 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34892 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34893 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34895 ; GFX11FAKE16-LABEL: v_vselect_v2bf16:
34896 ; GFX11FAKE16: ; %bb.0:
34897 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34898 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34899 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34900 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
34901 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
34902 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34903 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
34904 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34905 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
34906 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34907 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34908 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34909 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
34910 ret <2 x bfloat> %op
; s_select_bf16 - amdgpu_ps variant of the scalar bfloat select: both bfloat
; operands are `inreg` arguments and the condition is `%c == 0`, computed from
; a non-inreg i32. The selected value is bitcast to i16, zero-extended to i32
; and passed through llvm.amdgcn.readfirstlane.
; The checks expect the operands to be read from s0 and s1, the compare to be
; against 0 on v0, and a final v_readfirstlane_b32 into s0. The GCN and GFX7
; runs shift the selected value right by 16; the other runs mask it with
; 0xffff instead.
34913 define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
34914 ; GCN-LABEL: s_select_bf16:
34916 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
34917 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
34918 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34919 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34920 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34921 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
34922 ; GCN-NEXT: ; return to shader part epilog
34924 ; GFX7-LABEL: s_select_bf16:
34926 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
34927 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
34928 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34929 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34930 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34931 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
34932 ; GFX7-NEXT: ; return to shader part epilog
34934 ; GFX8-LABEL: s_select_bf16:
34936 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
34937 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
34938 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34939 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
34940 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
34941 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
34942 ; GFX8-NEXT: ; return to shader part epilog
34944 ; GFX9-LABEL: s_select_bf16:
34946 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
34947 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
34948 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34949 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
34950 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
34951 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
34952 ; GFX9-NEXT: ; return to shader part epilog
34954 ; GFX10-LABEL: s_select_bf16:
34956 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
34957 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
34958 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34959 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
34960 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
34961 ; GFX10-NEXT: ; return to shader part epilog
34963 ; GFX11TRUE16-LABEL: s_select_bf16:
34964 ; GFX11TRUE16: ; %bb.0:
34965 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
34966 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
34967 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
34968 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
34969 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
34970 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34971 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
34972 ; GFX11TRUE16-NEXT: ; return to shader part epilog
34974 ; GFX11FAKE16-LABEL: s_select_bf16:
34975 ; GFX11FAKE16: ; %bb.0:
34976 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0
34977 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
34978 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
34979 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34980 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
34981 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34982 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
34983 ; GFX11FAKE16-NEXT: ; return to shader part epilog
34984 %cond = icmp eq i32 %c, 0
34985 %op = select i1 %cond, bfloat %a, bfloat %b
34986 %cast = bitcast bfloat %op to i16
34987 %zext = zext i16 %cast to i32
34988 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; s_select_v2bf16 - amdgpu_ps variant of the <2 x bfloat> select with a single
; condition: both vector operands are `inreg` arguments and the condition is
; `%c == 0`, computed from a non-inreg i32. The selected vector is bitcast to
; i32 and passed through llvm.amdgcn.readfirstlane.
; The GCN and GFX7 runs expect the four elements in s0-s3 and recombine the
; selected halves with a shift and v_or_b32. The other runs expect packed
; operands in s0 and s1, split the high halves off with s_lshr_b32, and repack
; after the two v_cndmask instructions. Every run ends with
; v_readfirstlane_b32 into s0.
34992 define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
34993 ; GCN-LABEL: s_select_v2bf16:
34995 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
34996 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3
34997 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0
34998 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2
34999 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
35000 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35001 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35002 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35003 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35004 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
35005 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35006 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35007 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
35008 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
35009 ; GCN-NEXT: ; return to shader part epilog
35011 ; GFX7-LABEL: s_select_v2bf16:
35013 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
35014 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3
35015 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
35016 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35017 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0
35018 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
35019 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35020 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35021 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35022 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35023 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
35024 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35025 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
35026 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
35027 ; GFX7-NEXT: ; return to shader part epilog
35029 ; GFX8-LABEL: s_select_v2bf16:
35031 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
35032 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
35033 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
35034 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
35035 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35036 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
35037 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
35038 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
35039 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35040 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
35041 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
35042 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
35043 ; GFX8-NEXT: ; return to shader part epilog
35045 ; GFX9-LABEL: s_select_v2bf16:
35047 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
35048 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
35049 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
35050 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
35051 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35052 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
35053 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
35054 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
35055 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
35056 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
35057 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0
35058 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
35059 ; GFX9-NEXT: ; return to shader part epilog
35061 ; GFX10-LABEL: s_select_v2bf16:
35063 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
35064 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
35065 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
35066 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35067 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
35068 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35069 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35070 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35071 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
35072 ; GFX10-NEXT: ; return to shader part epilog
35074 ; GFX11TRUE16-LABEL: s_select_v2bf16:
35075 ; GFX11TRUE16: ; %bb.0:
35076 ; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
35077 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
35078 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35079 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
35080 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
35081 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
35082 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
35083 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
35084 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
35085 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
35086 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35087 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
35088 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
35089 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35090 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35091 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35092 ; GFX11TRUE16-NEXT: ; return to shader part epilog
35094 ; GFX11FAKE16-LABEL: s_select_v2bf16:
35095 ; GFX11FAKE16: ; %bb.0:
35096 ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
35097 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35098 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
35099 ; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16
35100 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35101 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35102 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
35103 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35104 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35105 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35106 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
35107 ; GFX11FAKE16-NEXT: ; return to shader part epilog
35108 %cond = icmp eq i32 %c, 0
35109 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
35110 %cast = bitcast <2 x bfloat> %op to i32
35111 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
35115 define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
35116 ; GCN-LABEL: s_vselect_v2bf16:
35118 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
35119 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2
35120 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1
35121 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
35122 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35123 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
35124 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35125 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
35126 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35127 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35128 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
35129 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
35130 ; GCN-NEXT: ; return to shader part epilog
35132 ; GFX7-LABEL: s_vselect_v2bf16:
35134 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1
35135 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3
35136 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35137 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
35138 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
35139 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
35140 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35141 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
35142 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35143 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35144 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
35145 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
35146 ; GFX7-NEXT: ; return to shader part epilog
35148 ; GFX8-LABEL: s_vselect_v2bf16:
35150 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
35151 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
35152 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
35153 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
35154 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35155 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
35156 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
35157 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
35158 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35159 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35160 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
35161 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
35162 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
35163 ; GFX8-NEXT: ; return to shader part epilog
35165 ; GFX9-LABEL: s_vselect_v2bf16:
35167 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
35168 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
35169 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
35170 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
35171 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35172 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
35173 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
35174 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
35175 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35176 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
35177 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
35178 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
35179 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
35180 ; GFX9-NEXT: ; return to shader part epilog
35182 ; GFX10-LABEL: s_vselect_v2bf16:
35184 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
35185 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
35186 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
35187 ; GFX10-NEXT: v_mov_b32_e32 v3, s0
35188 ; GFX10-NEXT: s_lshr_b32 s0, s1, 16
35189 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35190 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35191 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35192 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35193 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
35194 ; GFX10-NEXT: ; return to shader part epilog
35196 ; GFX11TRUE16-LABEL: s_vselect_v2bf16:
35197 ; GFX11TRUE16: ; %bb.0:
35198 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
35199 ; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
35200 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35201 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
35202 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
35203 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
35204 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
35205 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
35206 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
35207 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s2
35208 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
35209 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35210 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
35211 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
35212 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35213 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35214 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35215 ; GFX11TRUE16-NEXT: ; return to shader part epilog
35217 ; GFX11FAKE16-LABEL: s_vselect_v2bf16:
35218 ; GFX11FAKE16: ; %bb.0:
35219 ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
35220 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
35221 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
35222 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16
35223 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35224 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35225 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35226 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
35227 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35228 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35229 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35230 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
35231 ; GFX11FAKE16-NEXT: ; return to shader part epilog
35232 %cond = icmp eq <2 x i32> %c, zeroinitializer
35233 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
35234 %cast = bitcast <2 x bfloat> %op to i32
35235 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
; Whole-vector select on <3 x bfloat>: the single i1 %cond picks all of %a
; (when true) or all of %b (when false), and the chosen vector is returned.
;
; What the expected output below shows:
;  * GCN and GFX7 checks: every incoming element register (v1..v6) is
;    multiplied by 1.0, the first two elements of each operand are packed
;    with v_alignbit_b32, and the third element is shifted down and selected
;    on its own. After the v_cndmask_b32 selects, the result is split back
;    into one register per element (v0..v2).
;  * GFX8, GFX9 and GFX10 checks: two v_cndmask_b32 on the incoming register
;    pairs (v1/v3 and v2/v4), no repacking.
;  * gfx1100 checks: the same two selects issued as one v_dual_cndmask_b32.
;    They use the shared GFX11 prefix, so one set of assertions covers both
;    the +real-true16 and -real-true16 invocations.
;
; The assertions are autogenerated (see the first line of the file);
; regenerate them with the script instead of editing them by hand.
35239 define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
35240 ; GCN-LABEL: v_select_v3bf16:
35242 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35243 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35244 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35245 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35246 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35247 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35248 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35249 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35250 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35251 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35252 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35253 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
35254 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35255 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
35256 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35257 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35258 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
35259 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
35260 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35261 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35262 ; GCN-NEXT: s_setpc_b64 s[30:31]
35264 ; GFX7-LABEL: v_select_v3bf16:
35266 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35267 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35268 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35269 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35270 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35271 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
35272 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35273 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35274 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35275 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35276 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35277 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35278 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35279 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
35280 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35281 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35282 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
35283 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
35284 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35285 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35286 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35288 ; GFX8-LABEL: v_select_v3bf16:
35290 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35291 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35292 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35293 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35294 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35295 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35297 ; GFX9-LABEL: v_select_v3bf16:
35299 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35300 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35301 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35302 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35303 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35304 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35306 ; GFX10-LABEL: v_select_v3bf16:
35308 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35309 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35310 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35311 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35312 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35313 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35315 ; GFX11-LABEL: v_select_v3bf16:
35317 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35318 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35319 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35320 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35321 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35322 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar condition selects between the two full vectors.
35323 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
35324 ret <3 x bfloat> %op
; Whole-vector select on <4 x bfloat>: the single i1 %cond picks all of %a
; (when true) or all of %b (when false), and the chosen vector is returned.
;
; What the expected output below shows:
;  * GCN and GFX7 checks: every incoming element register (v1..v8) is
;    multiplied by 1.0, neighbouring elements are packed pairwise with
;    v_alignbit_b32, the packed dwords are selected with two v_cndmask_b32,
;    and the result is split back into one register per element (v0..v3).
;  * GFX8, GFX9 and GFX10 checks: two v_cndmask_b32 on the incoming register
;    pairs (v1/v3 and v2/v4), no repacking.
;  * gfx1100 checks: the same two selects issued as one v_dual_cndmask_b32.
;    They use the shared GFX11 prefix, so one set of assertions covers both
;    the +real-true16 and -real-true16 invocations.
;
; The assertions are autogenerated (see the first line of the file);
; regenerate them with the script instead of editing them by hand.
35327 define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
35328 ; GCN-LABEL: v_select_v4bf16:
35330 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35331 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35332 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35333 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35334 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35335 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35336 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35337 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35338 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35339 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35340 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35341 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35342 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35343 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35344 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35345 ; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16
35346 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35347 ; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16
35348 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35349 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35350 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35351 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35352 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35353 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35354 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35355 ; GCN-NEXT: s_setpc_b64 s[30:31]
35357 ; GFX7-LABEL: v_select_v4bf16:
35359 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35360 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35361 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35362 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35363 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35364 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35365 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6
35366 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35367 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35368 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35369 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35370 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35371 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
35372 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
35373 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35374 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
35375 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35376 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
35377 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35378 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35379 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35380 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35381 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35382 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35383 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35384 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35386 ; GFX8-LABEL: v_select_v4bf16:
35388 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35389 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35390 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35391 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35392 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35393 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35395 ; GFX9-LABEL: v_select_v4bf16:
35397 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35398 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35399 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35400 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35401 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35402 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35404 ; GFX10-LABEL: v_select_v4bf16:
35406 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35407 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35408 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35409 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35410 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35411 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35413 ; GFX11-LABEL: v_select_v4bf16:
35415 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35416 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35417 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35418 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35419 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35420 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar condition selects between the two full vectors.
35421 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
35422 ret <4 x bfloat> %op
; Whole-vector select on <6 x bfloat>: the single i1 %cond picks all of %a
; (when true) or all of %b (when false), and the chosen vector is returned.
;
; What the expected output below shows:
;  * GCN and GFX7 checks: every incoming element register (v1..v12) is
;    multiplied by 1.0, neighbouring elements are packed pairwise with
;    v_alignbit_b32, the packed dwords are selected with three v_cndmask_b32,
;    and the result is split back into one register per element (v0..v5).
;  * GFX8, GFX9 and GFX10 checks: three v_cndmask_b32 on the incoming
;    registers (v1..v3 against v4..v6), no repacking.
;  * gfx1100 checks: one v_dual_cndmask_b32 plus one plain v_cndmask_b32.
;    They use the shared GFX11 prefix, so one set of assertions covers both
;    the +real-true16 and -real-true16 invocations.
;
; The assertions are autogenerated (see the first line of the file);
; regenerate them with the script instead of editing them by hand.
35425 define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
35426 ; GCN-LABEL: v_select_v6bf16:
35428 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35429 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35430 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35431 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35432 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35433 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35434 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35435 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
35436 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35437 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35438 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35439 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35440 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35441 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35442 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35443 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35444 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35445 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35446 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35447 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35448 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35449 ; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16
35450 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35451 ; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16
35452 ; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
35453 ; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16
35454 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35455 ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35456 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35457 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35458 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35459 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35460 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35461 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35462 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35463 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35464 ; GCN-NEXT: s_setpc_b64 s[30:31]
35466 ; GFX7-LABEL: v_select_v6bf16:
35468 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35469 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35470 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35471 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35472 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35473 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35474 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
35475 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35476 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35477 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35478 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35479 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35480 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35481 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10
35482 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35483 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35484 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
35485 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35486 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9
35487 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35488 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12
35489 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
35490 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35491 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
35492 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35493 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
35494 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35495 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35496 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35497 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35498 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35499 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35500 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35501 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35502 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35503 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35504 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35506 ; GFX8-LABEL: v_select_v6bf16:
35508 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35509 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35510 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35511 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
35512 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
35513 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
35514 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35516 ; GFX9-LABEL: v_select_v6bf16:
35518 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35519 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35520 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35521 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
35522 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
35523 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
35524 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35526 ; GFX10-LABEL: v_select_v6bf16:
35528 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35529 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35530 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35531 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
35532 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
35533 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35534 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35536 ; GFX11-LABEL: v_select_v6bf16:
35538 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35539 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35540 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35541 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35542 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
35543 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35544 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar condition selects between the two full vectors.
35545 %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
35546 ret <6 x bfloat> %op
; Whole-vector select on <8 x bfloat>: the single i1 %cond picks all of %a
; (when true) or all of %b (when false), and the chosen vector is returned.
;
; What the expected output below shows:
;  * GCN and GFX7 checks: every incoming element register (v1..v16) is
;    multiplied by 1.0, neighbouring elements are packed pairwise with
;    v_alignbit_b32, the packed dwords are selected with four v_cndmask_b32,
;    and the result is split back into one register per element (v0..v7).
;  * GFX8, GFX9 and GFX10 checks: four v_cndmask_b32 on the incoming
;    registers (v1..v4 against v5..v8), no repacking.
;  * gfx1100 checks: the same four selects issued as two v_dual_cndmask_b32.
;    They use the shared GFX11 prefix, so one set of assertions covers both
;    the +real-true16 and -real-true16 invocations.
;
; The assertions are autogenerated (see the first line of the file);
; regenerate them with the script instead of editing them by hand.
35549 define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
35550 ; GCN-LABEL: v_select_v8bf16:
35552 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35553 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35554 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35555 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
35556 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35557 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35558 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35559 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35560 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35561 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35562 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35563 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
35564 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
35565 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35566 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35567 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
35568 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
35569 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35570 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35571 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35572 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35573 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35574 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35575 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35576 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35577 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35578 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35579 ; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16
35580 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35581 ; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16
35582 ; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
35583 ; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16
35584 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
35585 ; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16
35586 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35587 ; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35588 ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35589 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35590 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35591 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35592 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35593 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35594 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35595 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35596 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35597 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35598 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35599 ; GCN-NEXT: s_setpc_b64 s[30:31]
35601 ; GFX7-LABEL: v_select_v8bf16:
35603 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35604 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35605 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35606 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35607 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35608 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35609 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
35610 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35611 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35612 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35613 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35614 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
35615 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35616 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12
35617 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35618 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35619 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
35620 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16
35621 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35622 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11
35623 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35624 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14
35625 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35626 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35627 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16
35628 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35629 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13
35630 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
35631 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16
35632 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16
35633 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35634 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15
35635 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35636 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
35637 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35638 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35639 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35640 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35641 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35642 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35643 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35644 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35645 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35646 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35647 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35648 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35649 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35650 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35652 ; GFX8-LABEL: v_select_v8bf16:
35654 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35655 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35656 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35657 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
35658 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
35659 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
35660 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
35661 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35663 ; GFX9-LABEL: v_select_v8bf16:
35665 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35666 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35667 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35668 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
35669 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
35670 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
35671 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
35672 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35674 ; GFX10-LABEL: v_select_v8bf16:
35676 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35677 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35678 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35679 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
35680 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
35681 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo
35682 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo
35683 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35685 ; GFX11-LABEL: v_select_v8bf16:
35687 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35688 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35689 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35690 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35691 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
35692 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
35693 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one scalar condition selects between the two full vectors.
35694 %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
35695 ret <8 x bfloat> %op
35698 define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
35699 ; GCN-LABEL: v_select_v16bf16:
35701 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35702 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35703 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35704 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35705 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35706 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
35707 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
35708 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35709 ; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
35710 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35711 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35712 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35713 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35714 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35715 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
35716 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
35717 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35718 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35719 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
35720 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
35721 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35722 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35723 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24
35724 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23
35725 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
35726 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35727 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
35728 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25
35729 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35730 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35731 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28
35732 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27
35733 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
35734 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
35735 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30
35736 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
35737 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
35738 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
35739 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35740 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35741 ; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
35742 ; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
35743 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
35744 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
35745 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
35746 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35747 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
35748 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35749 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
35750 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35751 ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
35752 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35753 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
35754 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35755 ; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
35756 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
35757 ; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
35758 ; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
35759 ; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
35760 ; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
35761 ; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
35762 ; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
35763 ; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
35764 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
35765 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35766 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
35767 ; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
35768 ; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
35769 ; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35770 ; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
35771 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35772 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35773 ; GCN-NEXT: s_waitcnt vmcnt(1)
35774 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6
35775 ; GCN-NEXT: s_waitcnt vmcnt(0)
35776 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
35777 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35778 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35779 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35780 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35781 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35782 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35783 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35784 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35785 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
35786 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
35787 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
35788 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
35789 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
35790 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
35791 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35792 ; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
35793 ; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
35794 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
35795 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
35796 ; GCN-NEXT: s_setpc_b64 s[30:31]
35798 ; GFX7-LABEL: v_select_v16bf16:
35800 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35801 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35802 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35803 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35804 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35805 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35806 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
35807 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35808 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35809 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35810 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
35811 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35812 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
35813 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
35814 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35815 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
35816 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35817 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
35818 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35819 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35820 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
35821 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35822 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
35823 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
35824 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
35825 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35826 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35827 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
35828 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
35829 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
35830 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
35831 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
35832 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35833 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
35834 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
35835 ; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
35836 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35837 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
35838 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
35839 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
35840 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35841 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
35842 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
35843 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
35844 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
35845 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35846 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
35847 ; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
35848 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
35849 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35850 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
35851 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35852 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
35853 ; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
35854 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35855 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
35856 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
35857 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
35858 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
35859 ; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
35860 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35861 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
35862 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35863 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
35864 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35865 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
35866 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
35867 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
35868 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35869 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
35870 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35871 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35872 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35873 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35874 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35875 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35876 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35877 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35878 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
35879 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
35880 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
35881 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
35882 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
35883 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
35884 ; GFX7-NEXT: s_waitcnt vmcnt(1)
35885 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
35886 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35887 ; GFX7-NEXT: s_waitcnt vmcnt(0)
35888 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35889 ; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
35890 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
35891 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35892 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35893 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
35894 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
35895 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35897 ; GFX8-LABEL: v_select_v16bf16:
35899 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35900 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35901 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35902 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
35903 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
35904 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
35905 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
35906 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
35907 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
35908 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
35909 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
35910 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35912 ; GFX9-LABEL: v_select_v16bf16:
35914 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35915 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35916 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35917 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
35918 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
35919 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
35920 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
35921 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
35922 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
35923 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
35924 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
35925 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35927 ; GFX10-LABEL: v_select_v16bf16:
35929 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35930 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35931 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35932 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo
35933 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo
35934 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo
35935 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo
35936 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo
35937 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo
35938 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo
35939 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo
35940 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35942 ; GFX11-LABEL: v_select_v16bf16:
35944 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35945 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35946 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35947 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35948 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
35949 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
35950 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
35951 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
35952 ; GFX11-NEXT: s_setpc_b64 s[30:31]
35953 %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
35954 ret <16 x bfloat> %op
; v_select_v32bf16 - scalar-condition select over two <32 x bfloat> vectors.
; The IR body (the last two lines of this function) is a single select on the
; i1 %cond between %a and %b, followed by a ret of the selected vector.
; Everything in between is the expected llc output, one section per check
; prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11).
; These assertions are autogenerated by utils/update_llc_test_checks.py (see
; the note on the first line of the file); regenerate them with that script
; instead of editing them by hand.
; NOTE(review): the buffer_load_dword / scratch_load_b32 reads relative to s32
; in the expected output are presumably the operand parts that did not fit in
; argument VGPRs - confirm against the AMDGPU calling convention.
35957 define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
35958 ; GCN-LABEL: v_select_v32bf16:
35960 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35961 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35962 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35963 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
35964 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35965 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35966 ; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
35967 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
35968 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
35969 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
35970 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
35971 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6
35972 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
35973 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35974 ; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
35975 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8
35976 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
35977 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35978 ; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
35979 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
35980 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9
35981 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35982 ; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
35983 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12
35984 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
35985 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
35986 ; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
35987 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14
35988 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13
35989 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35990 ; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
35991 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16
35992 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15
35993 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
35994 ; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
35995 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
35996 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17
35997 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35998 ; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
35999 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20
36000 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19
36001 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
36002 ; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
36003 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12
36004 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22
36005 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21
36006 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
36007 ; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
36008 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
36009 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24
36010 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23
36011 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
36012 ; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
36013 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
36014 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26
36015 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25
36016 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
36017 ; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
36018 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
36019 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28
36020 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27
36021 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
36022 ; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
36023 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
36024 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30
36025 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
36026 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
36027 ; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16
36028 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24
36029 ; GCN-NEXT: s_waitcnt vmcnt(5)
36030 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
36031 ; GCN-NEXT: s_waitcnt vmcnt(4)
36032 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
36033 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
36034 ; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
36035 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
36036 ; GCN-NEXT: s_waitcnt vmcnt(4)
36037 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
36038 ; GCN-NEXT: s_waitcnt vmcnt(3)
36039 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
36040 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
36041 ; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
36042 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
36043 ; GCN-NEXT: s_waitcnt vmcnt(3)
36044 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
36045 ; GCN-NEXT: s_waitcnt vmcnt(2)
36046 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
36047 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
36048 ; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
36049 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
36050 ; GCN-NEXT: s_waitcnt vmcnt(2)
36051 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36052 ; GCN-NEXT: s_waitcnt vmcnt(1)
36053 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
36054 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40
36055 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36056 ; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16
36057 ; GCN-NEXT: s_waitcnt vmcnt(1)
36058 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
36059 ; GCN-NEXT: s_waitcnt vmcnt(0)
36060 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36061 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
36062 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
36063 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
36064 ; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
36065 ; GCN-NEXT: s_waitcnt vmcnt(1)
36066 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36067 ; GCN-NEXT: s_waitcnt vmcnt(0)
36068 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
36069 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
36070 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56
36071 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36072 ; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
36073 ; GCN-NEXT: s_waitcnt vmcnt(1)
36074 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
36075 ; GCN-NEXT: s_waitcnt vmcnt(0)
36076 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
36077 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68
36078 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
36079 ; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
36080 ; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
36081 ; GCN-NEXT: s_waitcnt vmcnt(1)
36082 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
36083 ; GCN-NEXT: s_waitcnt vmcnt(0)
36084 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
36085 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76
36086 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
36087 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36088 ; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
36089 ; GCN-NEXT: s_waitcnt vmcnt(1)
36090 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
36091 ; GCN-NEXT: s_waitcnt vmcnt(0)
36092 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
36093 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84
36094 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
36095 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
36096 ; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
36097 ; GCN-NEXT: s_waitcnt vmcnt(1)
36098 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
36099 ; GCN-NEXT: s_waitcnt vmcnt(0)
36100 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
36101 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
36102 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
36103 ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36104 ; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
36105 ; GCN-NEXT: s_waitcnt vmcnt(1)
36106 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
36107 ; GCN-NEXT: s_waitcnt vmcnt(0)
36108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
36109 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100
36110 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
36111 ; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
36112 ; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
36113 ; GCN-NEXT: s_waitcnt vmcnt(1)
36114 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
36115 ; GCN-NEXT: s_waitcnt vmcnt(0)
36116 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
36117 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108
36118 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
36119 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36120 ; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
36121 ; GCN-NEXT: s_waitcnt vmcnt(1)
36122 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
36123 ; GCN-NEXT: s_waitcnt vmcnt(0)
36124 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
36125 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
36126 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
36127 ; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
36128 ; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
36129 ; GCN-NEXT: s_waitcnt vmcnt(1)
36130 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
36131 ; GCN-NEXT: s_waitcnt vmcnt(0)
36132 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
36133 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
36134 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
36135 ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36136 ; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
36137 ; GCN-NEXT: s_waitcnt vmcnt(1)
36138 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
36139 ; GCN-NEXT: s_waitcnt vmcnt(0)
36140 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
36141 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
36142 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
36143 ; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
36144 ; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16
36145 ; GCN-NEXT: s_waitcnt vmcnt(1)
36146 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
36147 ; GCN-NEXT: s_waitcnt vmcnt(0)
36148 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
36149 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
36150 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
36151 ; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36152 ; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
36153 ; GCN-NEXT: s_waitcnt vmcnt(1)
36154 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
36155 ; GCN-NEXT: s_waitcnt vmcnt(0)
36156 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
36157 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
36158 ; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
36159 ; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
36160 ; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
36161 ; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
36162 ; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc
36163 ; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc
36164 ; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc
36165 ; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc
36166 ; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc
36167 ; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc
36168 ; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc
36169 ; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc
36170 ; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc
36171 ; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc
36172 ; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc
36173 ; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
36174 ; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc
36175 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
36176 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36177 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
36178 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36179 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
36180 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
36181 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
36182 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
36183 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
36184 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
36185 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
36186 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
36187 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
36188 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
36189 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
36190 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
36191 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23
36192 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
36193 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24
36194 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24
36195 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25
36196 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25
36197 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
36198 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
36199 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27
36200 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
36201 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28
36202 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
36203 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29
36204 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
36205 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
36206 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
36207 ; GCN-NEXT: s_setpc_b64 s[30:31]
36209 ; GFX7-LABEL: v_select_v32bf16:
36211 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36212 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
36213 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36214 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
36215 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
36216 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4
36217 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36218 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
36219 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36220 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
36221 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36222 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
36223 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
36224 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
36225 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36226 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
36227 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
36228 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
36229 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36230 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
36231 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
36232 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
36233 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
36234 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
36235 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
36236 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
36237 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
36238 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
36239 ; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
36240 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
36241 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
36242 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
36243 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
36244 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
36245 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
36246 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36247 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
36248 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
36249 ; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16
36250 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
36251 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
36252 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
36253 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
36254 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36255 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
36256 ; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
36257 ; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16
36258 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
36259 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
36260 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
36261 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
36262 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
36263 ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36264 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
36265 ; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16
36266 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
36267 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36268 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
36269 ; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
36270 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
36271 ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36272 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
36273 ; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
36274 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
36275 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36276 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
36277 ; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
36278 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
36279 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36280 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
36281 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
36282 ; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
36283 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
36284 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
36285 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
36286 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
36287 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
36288 ; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
36289 ; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
36290 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
36291 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
36292 ; GFX7-NEXT: s_waitcnt vmcnt(14)
36293 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
36294 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
36295 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
36296 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
36297 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36298 ; GFX7-NEXT: s_waitcnt vmcnt(13)
36299 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
36300 ; GFX7-NEXT: s_waitcnt vmcnt(12)
36301 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
36302 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
36303 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
36304 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
36305 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
36306 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
36307 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36308 ; GFX7-NEXT: s_waitcnt vmcnt(12)
36309 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
36310 ; GFX7-NEXT: s_waitcnt vmcnt(11)
36311 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
36312 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
36313 ; GFX7-NEXT: s_waitcnt vmcnt(9)
36314 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
36315 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
36316 ; GFX7-NEXT: s_waitcnt vmcnt(7)
36317 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
36318 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
36319 ; GFX7-NEXT: s_waitcnt vmcnt(6)
36320 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
36321 ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36322 ; GFX7-NEXT: s_waitcnt vmcnt(5)
36323 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
36324 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36325 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
36326 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36327 ; GFX7-NEXT: s_waitcnt vmcnt(4)
36328 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
36329 ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36330 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
36331 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36332 ; GFX7-NEXT: s_waitcnt vmcnt(3)
36333 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
36334 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36335 ; GFX7-NEXT: s_waitcnt vmcnt(1)
36336 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
36337 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
36338 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36339 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
36340 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
36341 ; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
36342 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
36343 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36344 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
36345 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
36346 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
36347 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
36348 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36349 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
36350 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
36351 ; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
36352 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
36353 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc
36354 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9
36355 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36356 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
36357 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
36358 ; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16
36359 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
36360 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc
36361 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
36362 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
36363 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
36364 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36365 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
36366 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
36367 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
36368 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
36369 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
36370 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36371 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36372 ; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
36373 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
36374 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
36375 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
36376 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
36377 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36378 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36379 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
36380 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
36381 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
36382 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
36383 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
36384 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36385 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36386 ; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
36387 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
36388 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
36389 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
36390 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
36391 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36392 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36393 ; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
36394 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
36395 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
36396 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
36397 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
36398 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36399 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36400 ; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
36401 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
36402 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
36403 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
36404 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
36405 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36406 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36407 ; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
36408 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
36409 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
36410 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
36411 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
36412 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36413 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36414 ; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
36415 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
36416 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
36417 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
36418 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
36419 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36420 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36421 ; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
36422 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
36423 ; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
36424 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
36425 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
36426 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36427 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36428 ; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
36429 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
36430 ; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
36431 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
36432 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
36433 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36434 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36435 ; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
36436 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
36437 ; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
36438 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
36439 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
36440 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36441 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36442 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
36443 ; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
36444 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
36445 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36446 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
36447 ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
36448 ; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
36449 ; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
36450 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
36451 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
36452 ; GFX7-NEXT: s_setpc_b64 s[30:31]
36454 ; GFX8-LABEL: v_select_v32bf16:
36456 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36457 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
36458 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36459 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
36460 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
36461 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
36462 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
36463 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
36464 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
36465 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
36466 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
36467 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
36468 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
36469 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
36470 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
36471 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
36472 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
36473 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
36474 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
36475 ; GFX8-NEXT: s_waitcnt vmcnt(1)
36476 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
36477 ; GFX8-NEXT: s_waitcnt vmcnt(0)
36478 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
36479 ; GFX8-NEXT: s_setpc_b64 s[30:31]
36481 ; GFX9-LABEL: v_select_v32bf16:
36483 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36484 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
36485 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36486 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
36487 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
36488 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32
36489 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
36490 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
36491 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
36492 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
36493 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
36494 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
36495 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
36496 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
36497 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
36498 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
36499 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
36500 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
36501 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
36502 ; GFX9-NEXT: s_waitcnt vmcnt(1)
36503 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
36504 ; GFX9-NEXT: s_waitcnt vmcnt(0)
36505 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
36506 ; GFX9-NEXT: s_setpc_b64 s[30:31]
36508 ; GFX10-LABEL: v_select_v32bf16:
36510 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36511 ; GFX10-NEXT: s_clause 0x1
36512 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
36513 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
36514 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
36515 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
36516 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo
36517 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo
36518 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo
36519 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo
36520 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo
36521 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo
36522 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo
36523 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo
36524 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo
36525 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo
36526 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo
36527 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo
36528 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo
36529 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo
36530 ; GFX10-NEXT: s_waitcnt vmcnt(1)
36531 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo
36532 ; GFX10-NEXT: s_waitcnt vmcnt(0)
36533 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo
36534 ; GFX10-NEXT: s_setpc_b64 s[30:31]
36536 ; GFX11-LABEL: v_select_v32bf16:
36538 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36539 ; GFX11-NEXT: s_clause 0x1
36540 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
36541 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
36542 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
36543 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
36544 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
36545 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
36546 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
36547 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
36548 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
36549 ; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
36550 ; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
36551 ; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
36552 ; GFX11-NEXT: s_waitcnt vmcnt(0)
36553 ; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
36554 ; GFX11-NEXT: s_setpc_b64 s[30:31]
36555 %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
36556 ret <32 x bfloat> %op
; s_select_v3bf16 - select between two <3 x bfloat> vectors with one scalar
; i1 condition (%c == 0). Both vectors are marked inreg and the function uses
; the amdgpu_ps calling convention; the expectations below read them from
; s-registers. The chosen value is returned as two dwords, each passed through
; llvm.amdgcn.readfirstlane.
; The per-target assembly expectations are autogenerated (see the note on the
; first line of this file); regenerate them instead of editing by hand.
36559 define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
36560 ; GCN-LABEL: s_select_v3bf16:
36562 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
36563 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
36564 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
36565 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
36566 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
36567 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
36568 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36569 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36570 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36571 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
36572 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
36573 ; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
36574 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36575 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
36576 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36577 ; GCN-NEXT: v_readfirstlane_b32 s0, v1
36578 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
36579 ; GCN-NEXT: ; return to shader part epilog
36581 ; GFX7-LABEL: s_select_v3bf16:
36583 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
36584 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36585 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
36586 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
36587 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
36588 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36589 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
36590 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36591 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
36592 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
36593 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36594 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36595 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36596 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36597 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36598 ; GFX7-NEXT: v_readfirstlane_b32 s0, v1
36599 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
36600 ; GFX7-NEXT: ; return to shader part epilog
36602 ; GFX8-LABEL: s_select_v3bf16:
36604 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
36605 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
36606 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36607 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36608 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
36609 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
36610 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36611 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
36612 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
36613 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
36614 ; GFX8-NEXT: ; return to shader part epilog
36616 ; GFX9-LABEL: s_select_v3bf16:
36618 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
36619 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
36620 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36621 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36622 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
36623 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
36624 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36625 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
36626 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
36627 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
36628 ; GFX9-NEXT: ; return to shader part epilog
36630 ; GFX10-LABEL: s_select_v3bf16:
36632 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
36633 ; GFX10-NEXT: v_mov_b32_e32 v2, s1
36634 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36635 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36636 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36637 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
36638 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
36639 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
36640 ; GFX10-NEXT: ; return to shader part epilog
36642 ; GFX11-LABEL: s_select_v3bf16:
36644 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
36645 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36646 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36647 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36648 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36649 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36650 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
36651 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
36652 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
36653 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
36654 ; GFX11-NEXT: ; return to shader part epilog
; One i1 condition picks the whole vector: %a when %c is zero, otherwise %b.
36655 %cond = icmp eq i32 %c, 0
36656 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
; Three bfloats occupy 48 bits; view them as one i48 so the value can be cut
; into a low 32-bit piece and a remaining 16-bit piece.
36657 %cast = bitcast <3 x bfloat> %op to i48
36658 %elt0 = trunc i48 %cast to i32
; Bits 32..47 become the second returned dword (zero in its upper half).
36659 %elt1.hi = lshr i48 %cast, 32
36660 %elt1 = trunc i48 %elt1.hi to i32
; Each dword goes through readfirstlane before being packed into the result.
36661 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36662 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
36663 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36664 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36665 ret <2 x i32> %bv.1
; s_select_v4bf16 - select between two <4 x bfloat> vectors with one scalar
; i1 condition (%c == 0). Both vectors are marked inreg and the function uses
; the amdgpu_ps calling convention. The chosen 64-bit value is reinterpreted
; as <2 x i32> and each dword is passed through llvm.amdgcn.readfirstlane.
; The per-target assembly expectations are autogenerated (see the note on the
; first line of this file); regenerate them instead of editing by hand.
36668 define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
36669 ; GCN-LABEL: s_select_v4bf16:
36671 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
36672 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
36673 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5
36674 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
36675 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
36676 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
36677 ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
36678 ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
36679 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36680 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36681 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36682 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
36683 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
36684 ; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
36685 ; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
36686 ; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
36687 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36688 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36689 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36690 ; GCN-NEXT: v_readfirstlane_b32 s0, v1
36691 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
36692 ; GCN-NEXT: ; return to shader part epilog
36694 ; GFX7-LABEL: s_select_v4bf16:
36696 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
36697 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36698 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
36699 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
36700 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
36701 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36702 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
36703 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36704 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
36705 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36706 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
36707 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
36708 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
36709 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36710 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
36711 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
36712 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36713 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36714 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36715 ; GFX7-NEXT: v_readfirstlane_b32 s0, v1
36716 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
36717 ; GFX7-NEXT: ; return to shader part epilog
36719 ; GFX8-LABEL: s_select_v4bf16:
36721 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
36722 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
36723 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36724 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36725 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
36726 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
36727 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36728 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1
36729 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0
36730 ; GFX8-NEXT: ; return to shader part epilog
36732 ; GFX9-LABEL: s_select_v4bf16:
36734 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
36735 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
36736 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36737 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36738 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
36739 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
36740 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36741 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
36742 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
36743 ; GFX9-NEXT: ; return to shader part epilog
36745 ; GFX10-LABEL: s_select_v4bf16:
36747 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
36748 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
36749 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36750 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36751 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36752 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0
36753 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
36754 ; GFX10-NEXT: ; return to shader part epilog
36756 ; GFX11-LABEL: s_select_v4bf16:
36758 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
36759 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36760 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36761 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36762 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36763 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36764 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
36765 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
36766 ; GFX11-NEXT: ; return to shader part epilog
; One i1 condition picks the whole vector: %a when %c is zero, otherwise %b.
36767 %cond = icmp eq i32 %c, 0
36768 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
; Four bfloats are exactly 64 bits, so the result maps onto two i32 elements.
36769 %cast = bitcast <4 x bfloat> %op to <2 x i32>
36770 %elt0 = extractelement <2 x i32> %cast, i32 0
36771 %elt1 = extractelement <2 x i32> %cast, i32 1
; Each dword goes through readfirstlane before being packed into the result.
36772 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36773 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
36774 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36775 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36776 ret <2 x i32> %bv.1
; s_vselect_v4bf16 - element-wise select between two <4 x bfloat> vectors
; marked inreg, under the amdgpu_ps calling convention. Unlike s_select_v4bf16,
; the condition is a <4 x i1> produced by comparing each lane of %c with zero.
; The chosen 64-bit value is reinterpreted as <2 x i32> and each dword is
; passed through llvm.amdgcn.readfirstlane.
; For gfx1100 the expectations are split into GFX11TRUE16 and GFX11FAKE16
; sections, matching the two RUN configurations (+real-true16 / -real-true16)
; at the top of the file.
; All assembly expectations are autogenerated; regenerate them instead of
; editing by hand.
36779 define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
36780 ; GCN-LABEL: s_vselect_v4bf16:
36782 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
36783 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4
36784 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1
36785 ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5
36786 ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2
36787 ; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6
36788 ; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3
36789 ; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7
36790 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36791 ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
36792 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36793 ; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
36794 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36795 ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
36796 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36797 ; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
36798 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36799 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36800 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36801 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
36802 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3
36803 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
36804 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
36805 ; GCN-NEXT: v_readfirstlane_b32 s1, v2
36806 ; GCN-NEXT: ; return to shader part epilog
36808 ; GFX7-LABEL: s_vselect_v4bf16:
36810 ; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3
36811 ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7
36812 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36813 ; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2
36814 ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6
36815 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
36816 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36817 ; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1
36818 ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5
36819 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
36820 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36821 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
36822 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4
36823 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
36824 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36825 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
36826 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36827 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36828 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36829 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
36830 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
36831 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
36832 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
36833 ; GFX7-NEXT: v_readfirstlane_b32 s1, v2
36834 ; GFX7-NEXT: ; return to shader part epilog
36836 ; GFX8-LABEL: s_vselect_v4bf16:
36838 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16
36839 ; GFX8-NEXT: s_lshr_b32 s5, s3, 16
36840 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
36841 ; GFX8-NEXT: v_mov_b32_e32 v5, s4
36842 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36843 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
36844 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
36845 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
36846 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36847 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
36848 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
36849 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
36850 ; GFX8-NEXT: s_lshr_b32 s3, s2, 16
36851 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36852 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
36853 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
36854 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36855 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
36856 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
36857 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
36858 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36859 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
36860 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
36861 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36862 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
36863 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2
36864 ; GFX8-NEXT: ; return to shader part epilog
36866 ; GFX9-LABEL: s_vselect_v4bf16:
36868 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16
36869 ; GFX9-NEXT: s_lshr_b32 s5, s3, 16
36870 ; GFX9-NEXT: v_mov_b32_e32 v4, s5
36871 ; GFX9-NEXT: v_mov_b32_e32 v5, s4
36872 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36873 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
36874 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
36875 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
36876 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36877 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
36878 ; GFX9-NEXT: s_mov_b32 s1, 0x5040100
36879 ; GFX9-NEXT: s_lshr_b32 s3, s0, 16
36880 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
36881 ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1
36882 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
36883 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
36884 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36885 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
36886 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
36887 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
36888 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36889 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
36890 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1
36891 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
36892 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
36893 ; GFX9-NEXT: ; return to shader part epilog
36895 ; GFX10-LABEL: s_vselect_v4bf16:
36897 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
36898 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
36899 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
36900 ; GFX10-NEXT: s_lshr_b32 s4, s3, 16
36901 ; GFX10-NEXT: s_lshr_b32 s5, s0, 16
36902 ; GFX10-NEXT: v_mov_b32_e32 v6, s0
36903 ; GFX10-NEXT: s_lshr_b32 s0, s2, 16
36904 ; GFX10-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36905 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
36906 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
36907 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
36908 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36909 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36910 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36911 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
36912 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
36913 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36914 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
36915 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
36916 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
36917 ; GFX10-NEXT: ; return to shader part epilog
36919 ; GFX11TRUE16-LABEL: s_vselect_v4bf16:
36920 ; GFX11TRUE16: ; %bb.0:
36921 ; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
36922 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36923 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
36924 ; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
36925 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
36926 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
36927 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
36928 ; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
36929 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
36930 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
36931 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
36932 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
36933 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
36934 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
36935 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
36936 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
36937 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s6
36938 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.l, s4
36939 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
36940 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo
36941 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s5
36942 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
36943 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
36944 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
36945 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
36946 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
36947 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
36948 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36949 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
36950 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1
36951 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
36952 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
36953 ; GFX11TRUE16-NEXT: ; return to shader part epilog
36955 ; GFX11FAKE16-LABEL: s_vselect_v4bf16:
36956 ; GFX11FAKE16: ; %bb.0:
36957 ; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16
36958 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
36959 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
36960 ; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16
36961 ; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16
36962 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
36963 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36964 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5
36965 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
36966 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0
36967 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16
36968 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
36969 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36970 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36971 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
36972 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36973 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
36974 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
36975 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36976 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36977 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
36978 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
36979 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
36980 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1
36981 ; GFX11FAKE16-NEXT: ; return to shader part epilog
; Per-lane condition: lane i takes %a[i] when %c[i] is zero, otherwise %b[i].
36982 %cond = icmp eq <4 x i32> %c, zeroinitializer
36983 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
; Four bfloats are exactly 64 bits, so the result maps onto two i32 elements.
36984 %cast = bitcast <4 x bfloat> %op to <2 x i32>
36985 %elt0 = extractelement <2 x i32> %cast, i32 0
36986 %elt1 = extractelement <2 x i32> %cast, i32 1
; Each dword goes through readfirstlane before being packed into the result.
36987 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36988 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
36989 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36990 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36991 ret <2 x i32> %bv.1
; v_vselect_v4bf16 - element-wise select between two <4 x bfloat> vectors
; using a <4 x i1> condition argument, with the default calling convention
; (no amdgpu_ps, no inreg). The selected vector is returned directly.
; For gfx1100 the expectations are split into GFX11TRUE16 and GFX11FAKE16
; sections, matching the two RUN configurations (+real-true16 / -real-true16)
; at the top of the file.
; All assembly expectations are autogenerated; regenerate them instead of
; editing by hand.
36994 define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
36995 ; GCN-LABEL: v_vselect_v4bf16:
36997 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36998 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
36999 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37000 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
37001 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
37002 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
37003 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
37004 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
37005 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
37006 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
37007 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37008 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
37009 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
37010 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37011 ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
37012 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37013 ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
37014 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37015 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
37016 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37017 ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
37018 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37019 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37020 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37021 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37022 ; GCN-NEXT: s_setpc_b64 s[30:31]
37024 ; GFX7-LABEL: v_vselect_v4bf16:
37026 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37027 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
37028 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
37029 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
37030 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
37031 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37032 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
37033 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
37034 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
37035 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
37036 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37037 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
37038 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
37039 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
37040 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
37041 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37042 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
37043 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37044 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
37045 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37046 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
37047 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37048 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37049 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37050 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37051 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37053 ; GFX8-LABEL: v_vselect_v4bf16:
37055 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37056 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
37057 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
37058 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5
37059 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
37060 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37061 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
37062 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
37063 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37064 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37065 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
37066 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
37067 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
37068 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37069 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
37070 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37071 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
37072 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37073 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37074 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
37075 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37076 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37078 ; GFX9-LABEL: v_vselect_v4bf16:
37080 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37081 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37082 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37083 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37084 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37085 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
37086 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37087 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37088 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37089 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37090 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
37091 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37092 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
37093 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
37094 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6
37095 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37096 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
37097 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37098 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37099 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37100 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37102 ; GFX10-LABEL: v_vselect_v4bf16:
37104 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37105 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
37106 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
37107 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
37108 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
37109 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
37110 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37111 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
37112 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
37113 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37114 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37115 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37116 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
37117 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37118 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37119 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37120 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37121 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37122 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37123 ; GFX10-NEXT: s_setpc_b64 s[30:31]
37125 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
37126 ; GFX11TRUE16: ; %bb.0:
37127 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37128 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37129 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
37130 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
37131 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
37132 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
37133 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
37134 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2
37135 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37136 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
37137 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
37138 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
37139 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
37140 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
37141 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
37142 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
37143 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
37144 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v8.l, v3.l, s1
37145 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
37146 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s2
37147 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
37148 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
37149 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
37150 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
37151 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
37152 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37153 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
37154 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
37155 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37157 ; GFX11FAKE16-LABEL: v_vselect_v4bf16:
37158 ; GFX11FAKE16: ; %bb.0:
37159 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37160 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
37161 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
37162 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
37163 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37164 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37165 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
37166 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37167 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37168 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
37169 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37170 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
37171 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37172 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37173 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37174 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37175 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37176 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37177 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37178 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37179 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; Lane i of the result is %a[i] when %cond[i] is true, otherwise %b[i].
37180 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
37181 ret <4 x bfloat> %op
; Vector select on <8 x bfloat>: every result lane takes the element of %a
; when the matching lane of the <8 x i1> %cond is set, and the element of %b
; otherwise.
;
; The assertion lines below are autogenerated (the header of this file names
; utils/update_llc_test_checks.py); regenerate them with that script rather
; than editing them by hand. There is one group of assertions per check prefix
; declared by the RUN lines at the top of the file.
;
; What the expected output shows, per prefix group:
;   * GCN and GFX7 groups - the sixteen data inputs go through v_mul_f32 by
;     1.0, eight 32-bit v_cndmask_b32 selects follow (one per lane), and every
;     result register v0..v7 is masked with 0xffff0000.
;   * GFX8 group - 32-bit selects on the raw registers and on copies shifted
;     right by 16, re-packed into v0..v3 with v_lshlrev_b32 + v_or_b32_sdwa.
;   * GFX9, GFX10 and GFX11FAKE16 groups - the same select scheme, re-packed
;     into v0..v3 with v_perm_b32 and the selector 0x5040100.
;   * GFX11TRUE16 group - 16-bit v_cndmask_b16 selects on register halves,
;     then v_mov_b16 copies and v_perm_b32 with the selector 0x5040100.
37184 define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
37185 ; GCN-LABEL: v_vselect_v8bf16:
37187 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37188 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7
37189 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6
37190 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5
37191 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
37192 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
37193 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
37194 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
37195 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
37196 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
37197 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
37198 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
37199 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
37200 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
37201 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
37202 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37203 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
37204 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
37205 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
37206 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
37207 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
37208 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
37209 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
37210 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
37211 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
37212 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37213 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
37214 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37215 ; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
37216 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37217 ; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
37218 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37219 ; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
37220 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37221 ; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
37222 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37223 ; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
37224 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37225 ; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
37226 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37227 ; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
37228 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37229 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37230 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37231 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37232 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
37233 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37234 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37235 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37236 ; GCN-NEXT: s_setpc_b64 s[30:31]
37238 ; GFX7-LABEL: v_vselect_v8bf16:
37240 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37241 ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
37242 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
37243 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
37244 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37245 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
37246 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
37247 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
37248 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
37249 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37250 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
37251 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc
37252 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
37253 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
37254 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37255 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
37256 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
37257 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
37258 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
37259 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37260 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
37261 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc
37262 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
37263 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
37264 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37265 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
37266 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc
37267 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
37268 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
37269 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18
37270 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37271 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
37272 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
37273 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
37274 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc
37275 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37276 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37277 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16
37278 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc
37279 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37280 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc
37281 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37282 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37283 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37284 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37285 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
37286 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37287 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37288 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37289 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37291 ; GFX8-LABEL: v_vselect_v8bf16:
37293 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37294 ; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
37295 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
37296 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11
37297 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
37298 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37299 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
37300 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc
37301 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37302 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
37303 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
37304 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
37305 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
37306 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37307 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
37308 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc
37309 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37310 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
37311 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
37312 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
37313 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13
37314 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37315 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
37316 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
37317 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37318 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37319 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
37320 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8
37321 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12
37322 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37323 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
37324 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37325 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
37326 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37327 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37328 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
37329 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37330 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
37331 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
37332 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37333 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37334 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37336 ; GFX9-LABEL: v_vselect_v8bf16:
37338 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37339 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
37340 ; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
37341 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37342 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
37343 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
37344 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37345 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37346 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37347 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
37348 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
37349 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37350 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37351 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
37352 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
37353 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14
37354 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37355 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37356 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
37357 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37358 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37359 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
37360 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37361 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13
37362 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37363 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37364 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
37365 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37366 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
37367 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
37368 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12
37369 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37370 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
37371 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37372 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37373 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37374 ; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
37375 ; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4
37376 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37378 ; GFX10-LABEL: v_vselect_v8bf16:
37380 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37381 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
37382 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
37383 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
37384 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
37385 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10
37386 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37387 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14
37388 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
37389 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
37390 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
37391 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
37392 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37393 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
37394 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37395 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37396 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
37397 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37398 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
37399 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12
37400 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37401 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37402 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
37403 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37404 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37405 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
37406 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37407 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37408 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37409 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37410 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37411 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37412 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37413 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37414 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37415 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
37416 ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
37417 ; GFX10-NEXT: s_setpc_b64 s[30:31]
37419 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:
37420 ; GFX11TRUE16: ; %bb.0:
37421 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37422 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
37423 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37424 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
37425 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
37426 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
37427 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37428 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
37429 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7
37430 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6
37431 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
37432 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
37433 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15
37434 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
37435 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
37436 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v11
37437 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
37438 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v2
37439 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v3
37440 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8
37441 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s2
37442 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12
37443 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v9
37444 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v13
37445 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10
37446 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v14
37447 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v15.l, v11.l, s3
37448 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v14.l, v10.l, s4
37449 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo
37450 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v12.l, v8.l, s0
37451 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v5.l, v4.l, s1
37452 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v13.l, v9.l, s5
37453 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v6.l, s6
37454 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
37455 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
37456 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
37457 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
37458 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
37459 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
37460 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.h
37461 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
37462 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v4, v5, 0x5040100
37463 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v6, 0x5040100
37464 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v3, v7, 0x5040100
37465 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
37466 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100
37467 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37469 ; GFX11FAKE16-LABEL: v_vselect_v8bf16:
37470 ; GFX11FAKE16: ; %bb.0:
37471 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37472 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10
37473 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14
37474 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
37475 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
37476 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
37477 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37478 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37479 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
37480 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37481 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37482 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
37483 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37484 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
37485 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37486 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
37487 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12
37488 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
37489 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37490 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37491 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
37492 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
37493 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37494 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37495 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
37496 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37497 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37498 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37499 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37500 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37501 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37502 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37503 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37504 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37505 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37506 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
37507 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
37508 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
37509 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single vector select whose result is returned directly.
37510 %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
37511 ret <8 x bfloat> %op
37514 define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
37515 ; GCN-LABEL: v_vselect_v16bf16:
37517 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37518 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
37519 ; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
37520 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
37521 ; GCN-NEXT: s_waitcnt expcnt(0)
37522 ; GCN-NEXT: v_writelane_b32 v31, s30, 0
37523 ; GCN-NEXT: v_writelane_b32 v31, s31, 1
37524 ; GCN-NEXT: v_writelane_b32 v31, s34, 2
37525 ; GCN-NEXT: v_writelane_b32 v31, s35, 3
37526 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
37527 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37528 ; GCN-NEXT: v_and_b32_e32 v0, 1, v1
37529 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
37530 ; GCN-NEXT: v_and_b32_e32 v0, 1, v2
37531 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
37532 ; GCN-NEXT: v_and_b32_e32 v0, 1, v3
37533 ; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
37534 ; GCN-NEXT: v_and_b32_e32 v0, 1, v4
37535 ; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
37536 ; GCN-NEXT: v_and_b32_e32 v0, 1, v5
37537 ; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
37538 ; GCN-NEXT: v_and_b32_e32 v0, 1, v6
37539 ; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
37540 ; GCN-NEXT: v_and_b32_e32 v0, 1, v7
37541 ; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
37542 ; GCN-NEXT: v_and_b32_e32 v0, 1, v8
37543 ; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
37544 ; GCN-NEXT: v_and_b32_e32 v0, 1, v9
37545 ; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
37546 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
37547 ; GCN-NEXT: v_and_b32_e32 v1, 1, v10
37548 ; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
37549 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
37550 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
37551 ; GCN-NEXT: v_and_b32_e32 v2, 1, v11
37552 ; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2
37553 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
37554 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
37555 ; GCN-NEXT: v_and_b32_e32 v3, 1, v12
37556 ; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3
37557 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
37558 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
37559 ; GCN-NEXT: v_and_b32_e32 v7, 1, v13
37560 ; GCN-NEXT: v_and_b32_e32 v8, 1, v14
37561 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
37562 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32
37563 ; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8
37564 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
37565 ; GCN-NEXT: v_and_b32_e32 v9, 1, v15
37566 ; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9
37567 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60
37568 ; GCN-NEXT: s_waitcnt vmcnt(2)
37569 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37570 ; GCN-NEXT: s_waitcnt vmcnt(1)
37571 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37572 ; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35]
37573 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56
37574 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
37575 ; GCN-NEXT: s_waitcnt vmcnt(1)
37576 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
37577 ; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31]
37578 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
37579 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29
37580 ; GCN-NEXT: s_waitcnt vmcnt(1)
37581 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37582 ; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29]
37583 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
37584 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28
37585 ; GCN-NEXT: s_waitcnt vmcnt(1)
37586 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37587 ; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27]
37588 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
37589 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27
37590 ; GCN-NEXT: s_waitcnt vmcnt(1)
37591 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37592 ; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25]
37593 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
37594 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
37595 ; GCN-NEXT: s_waitcnt vmcnt(1)
37596 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37597 ; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23]
37598 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
37599 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25
37600 ; GCN-NEXT: s_waitcnt vmcnt(1)
37601 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37602 ; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21]
37603 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
37604 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24
37605 ; GCN-NEXT: s_waitcnt vmcnt(1)
37606 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37607 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19]
37608 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28
37609 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
37610 ; GCN-NEXT: s_waitcnt vmcnt(1)
37611 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37612 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17]
37613 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
37614 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
37615 ; GCN-NEXT: s_waitcnt vmcnt(1)
37616 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
37617 ; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15]
37618 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
37619 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
37620 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
37621 ; GCN-NEXT: s_waitcnt vmcnt(1)
37622 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
37623 ; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
37624 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
37625 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
37626 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
37627 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
37628 ; GCN-NEXT: s_waitcnt vmcnt(1)
37629 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
37630 ; GCN-NEXT: s_waitcnt vmcnt(0)
37631 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
37632 ; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
37633 ; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9]
37634 ; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7]
37635 ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
37636 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
37637 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37638 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37639 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37640 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37641 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
37642 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
37643 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
37644 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37645 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
37646 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
37647 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
37648 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
37649 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
37650 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
37651 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
37652 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
37653 ; GCN-NEXT: v_readlane_b32 s35, v31, 3
37654 ; GCN-NEXT: v_readlane_b32 s34, v31, 2
37655 ; GCN-NEXT: v_readlane_b32 s31, v31, 1
37656 ; GCN-NEXT: v_readlane_b32 s30, v31, 0
37657 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
37658 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
37659 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
37660 ; GCN-NEXT: s_waitcnt vmcnt(0)
37661 ; GCN-NEXT: s_setpc_b64 s[30:31]
37663 ; GFX7-LABEL: v_vselect_v16bf16:
37665 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37666 ; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
37667 ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
37668 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8
37669 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7
37670 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32
37671 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
37672 ; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
37673 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15
37674 ; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
37675 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14
37676 ; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
37677 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13
37678 ; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
37679 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12
37680 ; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
37681 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11
37682 ; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
37683 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
37684 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
37685 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
37686 ; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
37687 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9
37688 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
37689 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
37690 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
37691 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
37692 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
37693 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
37694 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
37695 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
37696 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
37697 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
37698 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37699 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
37700 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37701 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37702 ; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13]
37703 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60
37704 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
37705 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
37706 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37707 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37708 ; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11]
37709 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
37710 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29
37711 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
37712 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37713 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37714 ; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9]
37715 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
37716 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28
37717 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
37718 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37719 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37720 ; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7]
37721 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
37722 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27
37723 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
37724 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37725 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37726 ; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5]
37727 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
37728 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26
37729 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
37730 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37731 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37732 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc
37733 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37734 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
37735 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
37736 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
37737 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25
37738 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
37739 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37740 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
37741 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
37742 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37743 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
37744 ; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
37745 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37746 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37747 ; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19]
37748 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
37749 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24
37750 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37751 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
37752 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37753 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
37754 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
37755 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37756 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
37757 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37758 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37759 ; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17]
37760 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23
37761 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
37762 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37763 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
37764 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37765 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
37766 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
37767 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
37768 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37769 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
37770 ; GFX7-NEXT: s_waitcnt vmcnt(2)
37771 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
37772 ; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15]
37773 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
37774 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37775 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37776 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
37777 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
37778 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
37779 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37780 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
37781 ; GFX7-NEXT: s_waitcnt vmcnt(2)
37782 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
37783 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc
37784 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37785 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37786 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20
37787 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37788 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
37789 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
37790 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37791 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc
37792 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37793 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37794 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
37795 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
37796 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37798 ; GFX8-LABEL: v_vselect_v16bf16:
37800 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37801 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
37802 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
37803 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
37804 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37805 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37806 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
37807 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
37808 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
37809 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
37810 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
37811 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
37812 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
37813 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
37814 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
37815 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
37816 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
37817 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
37818 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
37819 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
37820 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
37821 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
37822 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
37823 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
37824 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
37825 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
37826 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
37827 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
37828 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
37829 ; GFX8-NEXT: v_writelane_b32 v31, s30, 0
37830 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
37831 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
37832 ; GFX8-NEXT: v_writelane_b32 v31, s31, 1
37833 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
37834 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
37835 ; GFX8-NEXT: v_writelane_b32 v31, s34, 2
37836 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
37837 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
37838 ; GFX8-NEXT: v_writelane_b32 v31, s35, 3
37839 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
37840 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
37841 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30
37842 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29]
37843 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
37844 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
37845 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
37846 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
37847 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
37848 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
37849 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
37850 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
37851 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
37852 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
37853 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
37854 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
37855 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19]
37856 ; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
37857 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11]
37858 ; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
37859 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
37860 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
37861 ; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37862 ; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37863 ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37864 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37865 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31]
37866 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
37867 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35]
37868 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19
37869 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
37870 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
37871 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
37872 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
37873 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
37874 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17
37875 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25
37876 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9]
37877 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
37878 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5]
37879 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc
37880 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
37881 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37882 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
37883 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
37884 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11
37885 ; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37886 ; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37887 ; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37888 ; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37889 ; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37890 ; GFX8-NEXT: v_readlane_b32 s35, v31, 3
37891 ; GFX8-NEXT: v_readlane_b32 s34, v31, 2
37892 ; GFX8-NEXT: v_readlane_b32 s31, v31, 1
37893 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0
37894 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
37895 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
37896 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
37897 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37898 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37900 ; GFX9-LABEL: v_vselect_v16bf16:
37902 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37903 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
37904 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
37905 ; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
37906 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
37907 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
37908 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
37909 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
37910 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
37911 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc
37912 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
37913 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
37914 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc
37915 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
37916 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29
37917 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
37918 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc
37919 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32
37920 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
37921 ; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
37922 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
37923 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
37924 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
37925 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28
37926 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
37927 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
37928 ; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
37929 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc
37930 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37931 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
37932 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27
37933 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
37934 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
37935 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37936 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
37937 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc
37938 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37939 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
37940 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
37941 ; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
37942 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
37943 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37944 ; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
37945 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc
37946 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
37947 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37948 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
37949 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37950 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37951 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37952 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37953 ; GFX9-NEXT: s_waitcnt vmcnt(0)
37954 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc
37955 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21
37956 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
37957 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
37958 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37959 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
37960 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
37961 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
37962 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37963 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc
37964 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37965 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
37966 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16
37967 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
37968 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37969 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
37970 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37971 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37972 ; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
37973 ; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
37974 ; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4
37975 ; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4
37976 ; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4
37977 ; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4
37978 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37980 ; GFX10-LABEL: v_vselect_v16bf16:
37982 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37983 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
37984 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
37985 ; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
37986 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
37987 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22
37988 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30
37989 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
37990 ; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
37991 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
37992 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
37993 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
37994 ; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo
37995 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
37996 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
37997 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
37998 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
37999 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
38000 ; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
38001 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
38002 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
38003 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
38004 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
38005 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
38006 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
38007 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
38008 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
38009 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25
38010 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
38011 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
38012 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
38013 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
38014 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16
38015 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24
38016 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
38017 ; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
38018 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
38019 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
38020 ; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26
38021 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
38022 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19
38023 ; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27
38024 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
38025 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
38026 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
38027 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23
38028 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
38029 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
38030 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
38031 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
38032 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
38033 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
38034 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo
38035 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38036 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
38037 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38038 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo
38039 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
38040 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
38041 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
38042 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
38043 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
38044 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
38045 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38046 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
38047 ; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
38048 ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
38049 ; GFX10-NEXT: s_waitcnt vmcnt(0)
38050 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
38051 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo
38052 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
38053 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo
38054 ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
38055 ; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100
38056 ; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100
38057 ; GFX10-NEXT: s_setpc_b64 s[30:31]
38059 ; GFX11TRUE16-LABEL: v_vselect_v16bf16:
38060 ; GFX11TRUE16: ; %bb.0:
38061 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38062 ; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
38063 ; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
38064 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
38065 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
38066 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
38067 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
38068 ; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
38069 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
38070 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
38071 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
38072 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
38073 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
38074 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
38075 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
38076 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
38077 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
38078 ; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
38079 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
38080 ; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
38081 ; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
38082 ; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
38083 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
38084 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
38085 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
38086 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
38087 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38088 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
38089 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
38090 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
38091 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
38092 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v28.l, v20.l, s8
38093 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v38.l, v37.l, s7
38094 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
38095 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
38096 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
38097 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
38098 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
38099 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
38100 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
38101 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
38102 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
38103 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
38104 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
38105 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
38106 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
38107 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
38108 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
38109 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
38110 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
38111 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
38112 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6
38113 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s5
38114 ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo
38115 ; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v24.l, v16.l, s0
38116 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h
38117 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
38118 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v30.l, v22.l, s10
38119 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v33.l, s11
38120 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v29.l, v21.l, s12
38121 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v36.l, v35.l, s9
38122 ; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v52.l, v51.l, s1
38123 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v25.l, v17.l, s2
38124 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v50.l, v49.l, s3
38125 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
38126 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
38127 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
38128 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
38129 ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v26.l, v18.l, s4
38130 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
38131 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
38132 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
38133 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
38134 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.l
38135 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v0.h
38136 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
38137 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v7, v8, 0x5040100
38138 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100
38139 ; GFX11TRUE16-NEXT: v_perm_b32 v5, v14, v15, 0x5040100
38140 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
38141 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v31
38142 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v31.l, v23.l, s14
38143 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
38144 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v32.l, s13
38145 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
38146 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v6, v4, 0x5040100
38147 ; GFX11TRUE16-NEXT: v_perm_b32 v4, v12, v13, 0x5040100
38148 ; GFX11TRUE16-NEXT: v_perm_b32 v6, v16, v17, 0x5040100
38149 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
38150 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v10, v11, 0x5040100
38151 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
38152 ; GFX11TRUE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
38153 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
38155 ; GFX11FAKE16-LABEL: v_vselect_v16bf16:
38156 ; GFX11FAKE16: ; %bb.0:
38157 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38158 ; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32
38159 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
38160 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
38161 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
38162 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
38163 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
38164 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
38165 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
38166 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
38167 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
38168 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
38169 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
38170 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
38171 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
38172 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
38173 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
38174 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
38175 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
38176 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
38177 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
38178 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
38179 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
38180 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
38181 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
38182 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
38183 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
38184 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
38185 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
38186 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
38187 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
38188 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
38189 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
38190 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
38191 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
38192 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
38193 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
38194 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
38195 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
38196 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
38197 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
38198 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
38199 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
38200 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
38201 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
38202 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
38203 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
38204 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
38205 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
38206 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
38207 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38208 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
38209 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38210 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
38211 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
38212 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
38213 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
38214 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
38215 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
38216 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
38217 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
38218 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38219 ; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
38220 ; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
38221 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
38222 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31
38223 ; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
38224 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38225 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
38226 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
38227 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
38228 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
38229 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
38230 ; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
38231 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
38232 ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
38233 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
38234 %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
38235 ret <16 x bfloat> %op
38238 define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
38239 ; GCN-LABEL: v_vselect_v32bf16:
38241 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38242 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
38243 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
38244 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
38245 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
38246 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
38247 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
38248 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
38249 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
38250 ; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
38251 ; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
38252 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
38253 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
38254 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
38255 ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
38256 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
38257 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
38258 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
38259 ; GCN-NEXT: v_and_b32_e32 v36, 1, v13
38260 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
38261 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
38262 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
38263 ; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184
38264 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
38265 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
38266 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
38267 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192
38268 ; GCN-NEXT: v_and_b32_e32 v53, 1, v26
38269 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
38270 ; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
38271 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
38272 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
38273 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
38274 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
38275 ; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
38276 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112
38277 ; GCN-NEXT: v_and_b32_e32 v27, 1, v27
38278 ; GCN-NEXT: v_and_b32_e32 v28, 1, v28
38279 ; GCN-NEXT: v_and_b32_e32 v29, 1, v29
38280 ; GCN-NEXT: v_and_b32_e32 v30, 1, v30
38281 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116
38282 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120
38283 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124
38284 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
38285 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252
38286 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248
38287 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244
38288 ; GCN-NEXT: s_waitcnt expcnt(6)
38289 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240
38290 ; GCN-NEXT: s_waitcnt vmcnt(14)
38291 ; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37
38292 ; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
38293 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
38294 ; GCN-NEXT: s_waitcnt vmcnt(5)
38295 ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43
38296 ; GCN-NEXT: s_waitcnt vmcnt(3)
38297 ; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44
38298 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
38299 ; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5]
38300 ; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
38301 ; GCN-NEXT: s_waitcnt expcnt(5)
38302 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232
38303 ; GCN-NEXT: s_waitcnt expcnt(4)
38304 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228
38305 ; GCN-NEXT: s_waitcnt expcnt(3)
38306 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224
38307 ; GCN-NEXT: s_waitcnt expcnt(2)
38308 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220
38309 ; GCN-NEXT: s_waitcnt expcnt(1)
38310 ; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216
38311 ; GCN-NEXT: s_waitcnt expcnt(0)
38312 ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212
38313 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128
38314 ; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
38315 ; GCN-NEXT: s_waitcnt vmcnt(10)
38316 ; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
38317 ; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
38318 ; GCN-NEXT: s_waitcnt vmcnt(9)
38319 ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
38320 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
38321 ; GCN-NEXT: s_waitcnt vmcnt(8)
38322 ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
38323 ; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
38324 ; GCN-NEXT: s_waitcnt vmcnt(7)
38325 ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
38326 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
38327 ; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5]
38328 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
38329 ; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5]
38330 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
38331 ; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5]
38332 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53
38333 ; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5]
38334 ; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4
38335 ; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
38336 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8
38337 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136
38338 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
38339 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140
38340 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
38341 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
38342 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
38343 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
38344 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5
38345 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6
38346 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18
38347 ; GCN-NEXT: v_and_b32_e32 v22, 1, v22
38348 ; GCN-NEXT: v_and_b32_e32 v23, 1, v23
38349 ; GCN-NEXT: v_and_b32_e32 v24, 1, v24
38350 ; GCN-NEXT: v_and_b32_e32 v25, 1, v25
38351 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38352 ; GCN-NEXT: s_waitcnt vmcnt(14)
38353 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
38354 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38355 ; GCN-NEXT: s_waitcnt vmcnt(13)
38356 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
38357 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38358 ; GCN-NEXT: s_waitcnt vmcnt(12)
38359 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
38360 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38361 ; GCN-NEXT: s_waitcnt vmcnt(11)
38362 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
38363 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
38364 ; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5]
38365 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
38366 ; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5]
38367 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
38368 ; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5]
38369 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
38370 ; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5]
38371 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
38372 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196
38373 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
38374 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200
38375 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76
38376 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204
38377 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80
38378 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
38379 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19
38380 ; GCN-NEXT: v_and_b32_e32 v20, 1, v20
38381 ; GCN-NEXT: v_and_b32_e32 v21, 1, v21
38382 ; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
38383 ; GCN-NEXT: s_waitcnt vmcnt(14)
38384 ; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
38385 ; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
38386 ; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61
38387 ; GCN-NEXT: s_waitcnt vmcnt(3)
38388 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
38389 ; GCN-NEXT: s_waitcnt vmcnt(2)
38390 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
38391 ; GCN-NEXT: s_waitcnt vmcnt(1)
38392 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
38393 ; GCN-NEXT: s_waitcnt vmcnt(0)
38394 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
38395 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
38396 ; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5]
38397 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
38398 ; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5]
38399 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
38400 ; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5]
38401 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
38402 ; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5]
38403 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
38404 ; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148
38405 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
38406 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152
38407 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
38408 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156
38409 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
38410 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
38411 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7
38412 ; GCN-NEXT: v_and_b32_e32 v8, 1, v8
38413 ; GCN-NEXT: v_and_b32_e32 v9, 1, v9
38414 ; GCN-NEXT: v_and_b32_e32 v10, 1, v10
38415 ; GCN-NEXT: v_and_b32_e32 v14, 1, v14
38416 ; GCN-NEXT: v_and_b32_e32 v15, 1, v15
38417 ; GCN-NEXT: v_and_b32_e32 v16, 1, v16
38418 ; GCN-NEXT: v_and_b32_e32 v17, 1, v17
38419 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
38420 ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
38421 ; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
38422 ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
38423 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38424 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38425 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38426 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38427 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
38428 ; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5]
38429 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
38430 ; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5]
38431 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
38432 ; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5]
38433 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
38434 ; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5]
38435 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
38436 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
38437 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40
38438 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168
38439 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44
38440 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172
38441 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
38442 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176
38443 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11
38444 ; GCN-NEXT: v_and_b32_e32 v12, 1, v12
38445 ; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc
38446 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256
38447 ; GCN-NEXT: v_and_b32_e32 v26, 1, v26
38448 ; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
38449 ; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
38450 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
38451 ; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
38452 ; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
38453 ; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
38454 ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
38455 ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
38456 ; GCN-NEXT: s_waitcnt vmcnt(14)
38457 ; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
38458 ; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
38459 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
38460 ; GCN-NEXT: s_waitcnt vmcnt(13)
38461 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
38462 ; GCN-NEXT: s_waitcnt vmcnt(12)
38463 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
38464 ; GCN-NEXT: s_waitcnt vmcnt(11)
38465 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
38466 ; GCN-NEXT: s_waitcnt vmcnt(10)
38467 ; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
38468 ; GCN-NEXT: s_waitcnt vmcnt(9)
38469 ; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
38470 ; GCN-NEXT: s_waitcnt vmcnt(8)
38471 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
38472 ; GCN-NEXT: s_waitcnt vmcnt(7)
38473 ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
38474 ; GCN-NEXT: s_waitcnt vmcnt(6)
38475 ; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
38476 ; GCN-NEXT: s_waitcnt vmcnt(5)
38477 ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
38478 ; GCN-NEXT: s_waitcnt vmcnt(4)
38479 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38480 ; GCN-NEXT: s_waitcnt vmcnt(3)
38481 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38482 ; GCN-NEXT: s_waitcnt vmcnt(2)
38483 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38484 ; GCN-NEXT: s_waitcnt vmcnt(1)
38485 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38486 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
38487 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
38488 ; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
38489 ; GCN-NEXT: s_waitcnt vmcnt(0)
38490 ; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
38491 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
38492 ; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc
38493 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
38494 ; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc
38495 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
38496 ; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc
38497 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
38498 ; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc
38499 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
38500 ; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
38501 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
38502 ; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc
38503 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
38504 ; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc
38505 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
38506 ; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc
38507 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
38508 ; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc
38509 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
38510 ; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc
38511 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
38512 ; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc
38513 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38514 ; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc
38515 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38516 ; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc
38517 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
38518 ; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc
38519 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
38520 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
38521 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
38522 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
38523 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
38524 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
38525 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
38526 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
38527 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
38528 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
38529 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
38530 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
38531 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
38532 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
38533 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
38534 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
38535 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
38536 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
38537 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
38538 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
38539 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
38540 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
38541 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
38542 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
38543 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
38544 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
38545 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
38546 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
38547 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
38548 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
38549 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
38550 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
38551 ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
38552 ; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
38553 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
38554 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
38555 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
38556 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
38557 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
38558 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
38559 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
38560 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
38561 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
38562 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
38563 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
38564 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
38565 ; GCN-NEXT: s_waitcnt vmcnt(0)
38566 ; GCN-NEXT: s_setpc_b64 s[30:31]
38568 ; GFX7-LABEL: v_vselect_v32bf16:
38570 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38571 ; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
38572 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
38573 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32
38574 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228
38575 ; GFX7-NEXT: v_and_b32_e32 v25, 1, v25
38576 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25
38577 ; GFX7-NEXT: v_and_b32_e32 v30, 1, v30
38578 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30
38579 ; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
38580 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29
38581 ; GFX7-NEXT: v_and_b32_e32 v28, 1, v28
38582 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28
38583 ; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
38584 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27
38585 ; GFX7-NEXT: v_and_b32_e32 v26, 1, v26
38586 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26
38587 ; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
38588 ; GFX7-NEXT: v_and_b32_e32 v22, 1, v22
38589 ; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
38590 ; GFX7-NEXT: v_and_b32_e32 v20, 1, v20
38591 ; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
38592 ; GFX7-NEXT: v_and_b32_e32 v18, 1, v18
38593 ; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
38594 ; GFX7-NEXT: v_and_b32_e32 v16, 1, v16
38595 ; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
38596 ; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
38597 ; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
38598 ; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
38599 ; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
38600 ; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
38601 ; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
38602 ; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
38603 ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
38604 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
38605 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
38606 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
38607 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
38608 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
38609 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
38610 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
38611 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252
38612 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
38613 ; GFX7-NEXT: s_waitcnt vmcnt(3)
38614 ; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
38615 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
38616 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124
38617 ; GFX7-NEXT: s_waitcnt vmcnt(3)
38618 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38619 ; GFX7-NEXT: s_waitcnt vmcnt(2)
38620 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38621 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38622 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
38623 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38624 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38625 ; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13]
38626 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120
38627 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248
38628 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
38629 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38630 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38631 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38632 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38633 ; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15]
38634 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
38635 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244
38636 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
38637 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38638 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38639 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38640 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38641 ; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17]
38642 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112
38643 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240
38644 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
38645 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38646 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38647 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38648 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38649 ; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11]
38650 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108
38651 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236
38652 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
38653 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38654 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38655 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38656 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38657 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9]
38658 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
38659 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232
38660 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
38661 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38662 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38663 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38664 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38665 ; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7]
38666 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128
38667 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
38668 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38669 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38670 ; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5]
38671 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
38672 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
38673 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38674 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38675 ; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
38676 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
38677 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
38678 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
38679 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
38680 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38681 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
38682 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38683 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38684 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc
38685 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
38686 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
38687 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220
38688 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
38689 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38690 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
38691 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38692 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38693 ; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc
38694 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
38695 ; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
38696 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216
38697 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
38698 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38699 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
38700 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38701 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38702 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
38703 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
38704 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
38705 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212
38706 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
38707 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38708 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
38709 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38710 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38711 ; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
38712 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
38713 ; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
38714 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208
38715 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
38716 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38717 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
38718 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38719 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38720 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc
38721 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
38722 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
38723 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
38724 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
38725 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38726 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
38727 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38728 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38729 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
38730 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
38731 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
38732 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
38733 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
38734 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38735 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
38736 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38737 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38738 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
38739 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
38740 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
38741 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196
38742 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
38743 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38744 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
38745 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38746 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38747 ; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc
38748 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
38749 ; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
38750 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192
38751 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
38752 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38753 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
38754 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38755 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38756 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
38757 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
38758 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
38759 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188
38760 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
38761 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38762 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
38763 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38764 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38765 ; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
38766 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
38767 ; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
38768 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
38769 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
38770 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38771 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
38772 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38773 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38774 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
38775 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
38776 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
38777 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180
38778 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
38779 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38780 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
38781 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38782 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38783 ; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
38784 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
38785 ; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
38786 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176
38787 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
38788 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38789 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
38790 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38791 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38792 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
38793 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
38794 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
38795 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172
38796 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
38797 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38798 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
38799 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38800 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38801 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
38802 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
38803 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
38804 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
38805 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
38806 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38807 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
38808 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38809 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38810 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
38811 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
38812 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
38813 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164
38814 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
38815 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38816 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
38817 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38818 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38819 ; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
38820 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
38821 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
38822 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
38823 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
38824 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38825 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
38826 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38827 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38828 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
38829 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
38830 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
38831 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
38832 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
38833 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38834 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
38835 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38836 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38837 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
38838 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
38839 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
38840 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
38841 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
38842 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38843 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
38844 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38845 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38846 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
38847 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
38848 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
38849 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148
38850 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
38851 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38852 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
38853 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38854 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38855 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
38856 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
38857 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
38858 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
38859 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
38860 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38861 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
38862 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38863 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38864 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
38865 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
38866 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
38867 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140
38868 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
38869 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38870 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
38871 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38872 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38873 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
38874 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38875 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
38876 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
38877 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
38878 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38879 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
38880 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38881 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38882 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
38883 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38884 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
38885 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
38886 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
38887 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38888 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
38889 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38890 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38891 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
38892 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
38893 ; GFX7-NEXT: s_setpc_b64 s[30:31]
38895 ; GFX8-LABEL: v_vselect_v32bf16:
38897 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38898 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
38899 ; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
38900 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
38901 ; GFX8-NEXT: v_writelane_b32 v34, s30, 0
38902 ; GFX8-NEXT: v_writelane_b32 v34, s31, 1
38903 ; GFX8-NEXT: v_writelane_b32 v34, s34, 2
38904 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
38905 ; GFX8-NEXT: v_writelane_b32 v34, s35, 3
38906 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38907 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
38908 ; GFX8-NEXT: v_writelane_b32 v34, s36, 4
38909 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38910 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
38911 ; GFX8-NEXT: v_writelane_b32 v34, s37, 5
38912 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
38913 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
38914 ; GFX8-NEXT: v_writelane_b32 v34, s38, 6
38915 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
38916 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
38917 ; GFX8-NEXT: v_writelane_b32 v34, s39, 7
38918 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
38919 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
38920 ; GFX8-NEXT: v_writelane_b32 v34, s40, 8
38921 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
38922 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
38923 ; GFX8-NEXT: v_writelane_b32 v34, s41, 9
38924 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
38925 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
38926 ; GFX8-NEXT: v_writelane_b32 v34, s42, 10
38927 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
38928 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
38929 ; GFX8-NEXT: v_writelane_b32 v34, s43, 11
38930 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
38931 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
38932 ; GFX8-NEXT: v_writelane_b32 v34, s44, 12
38933 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
38934 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
38935 ; GFX8-NEXT: v_writelane_b32 v34, s45, 13
38936 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
38937 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
38938 ; GFX8-NEXT: v_writelane_b32 v34, s46, 14
38939 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
38940 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
38941 ; GFX8-NEXT: v_writelane_b32 v34, s47, 15
38942 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
38943 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
38944 ; GFX8-NEXT: v_writelane_b32 v34, s48, 16
38945 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
38946 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
38947 ; GFX8-NEXT: v_writelane_b32 v34, s49, 17
38948 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
38949 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
38950 ; GFX8-NEXT: v_writelane_b32 v34, s50, 18
38951 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
38952 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16
38953 ; GFX8-NEXT: v_writelane_b32 v34, s51, 19
38954 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
38955 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17
38956 ; GFX8-NEXT: v_writelane_b32 v34, s52, 20
38957 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
38958 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18
38959 ; GFX8-NEXT: v_writelane_b32 v34, s53, 21
38960 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
38961 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19
38962 ; GFX8-NEXT: v_writelane_b32 v34, s54, 22
38963 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
38964 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
38965 ; GFX8-NEXT: v_writelane_b32 v34, s55, 23
38966 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
38967 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
38968 ; GFX8-NEXT: v_writelane_b32 v34, s56, 24
38969 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
38970 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
38971 ; GFX8-NEXT: v_writelane_b32 v34, s57, 25
38972 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0
38973 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
38974 ; GFX8-NEXT: v_writelane_b32 v34, s58, 26
38975 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0
38976 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
38977 ; GFX8-NEXT: v_writelane_b32 v34, s59, 27
38978 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0
38979 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
38980 ; GFX8-NEXT: v_writelane_b32 v34, s60, 28
38981 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0
38982 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
38983 ; GFX8-NEXT: v_writelane_b32 v34, s61, 29
38984 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
38985 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
38986 ; GFX8-NEXT: v_writelane_b32 v34, s62, 30
38987 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
38988 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
38989 ; GFX8-NEXT: v_writelane_b32 v34, s63, 31
38990 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
38991 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
38992 ; GFX8-NEXT: v_writelane_b32 v34, s64, 32
38993 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
38994 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
38995 ; GFX8-NEXT: v_writelane_b32 v34, s65, 33
38996 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0
38997 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
38998 ; GFX8-NEXT: v_writelane_b32 v34, s66, 34
38999 ; GFX8-NEXT: v_writelane_b32 v34, s67, 35
39000 ; GFX8-NEXT: s_waitcnt vmcnt(0)
39001 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
39002 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0
39003 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
39004 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
39005 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
39006 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
39007 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
39008 ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
39009 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
39010 ; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
39011 ; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
39012 ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
39013 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
39014 ; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
39015 ; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
39016 ; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
39017 ; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
39018 ; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
39019 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
39020 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
39021 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
39022 ; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
39023 ; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
39024 ; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
39025 ; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
39026 ; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
39027 ; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
39028 ; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
39029 ; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
39030 ; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
39031 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
39032 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
39033 ; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128
39034 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
39035 ; GFX8-NEXT: s_waitcnt vmcnt(1)
39036 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29
39037 ; GFX8-NEXT: s_waitcnt vmcnt(0)
39038 ; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32
39039 ; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67]
39040 ; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65]
39041 ; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31
39042 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30
39043 ; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63]
39044 ; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61]
39045 ; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27
39046 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26
39047 ; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59]
39048 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39049 ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25
39050 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24
39051 ; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55]
39052 ; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39053 ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23
39054 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22
39055 ; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51]
39056 ; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39057 ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21
39058 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20
39059 ; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47]
39060 ; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39061 ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19
39062 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18
39063 ; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43]
39064 ; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39065 ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17
39066 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16
39067 ; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39]
39068 ; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39069 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
39070 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14
39071 ; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35]
39072 ; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39073 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13
39074 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12
39075 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29]
39076 ; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39077 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11
39078 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10
39079 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25]
39080 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39081 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9
39082 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8
39083 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21]
39084 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39085 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
39086 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6
39087 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17]
39088 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39089 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
39090 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4
39091 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13]
39092 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39093 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
39094 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2
39095 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9]
39096 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39097 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
39098 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0
39099 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5]
39100 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
39101 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
39102 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39103 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
39104 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39105 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
39106 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9
39107 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39108 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39109 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
39110 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13
39111 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15
39112 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17
39113 ; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39114 ; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39115 ; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39116 ; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39117 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
39118 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21
39119 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23
39120 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25
39121 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27
39122 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31
39123 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32
39124 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28
39125 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39126 ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39127 ; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39128 ; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39129 ; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39130 ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39131 ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39132 ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39133 ; GFX8-NEXT: v_readlane_b32 s67, v34, 35
39134 ; GFX8-NEXT: v_readlane_b32 s66, v34, 34
39135 ; GFX8-NEXT: v_readlane_b32 s65, v34, 33
39136 ; GFX8-NEXT: v_readlane_b32 s64, v34, 32
39137 ; GFX8-NEXT: v_readlane_b32 s63, v34, 31
39138 ; GFX8-NEXT: v_readlane_b32 s62, v34, 30
39139 ; GFX8-NEXT: v_readlane_b32 s61, v34, 29
39140 ; GFX8-NEXT: v_readlane_b32 s60, v34, 28
39141 ; GFX8-NEXT: v_readlane_b32 s59, v34, 27
39142 ; GFX8-NEXT: v_readlane_b32 s58, v34, 26
39143 ; GFX8-NEXT: v_readlane_b32 s57, v34, 25
39144 ; GFX8-NEXT: v_readlane_b32 s56, v34, 24
39145 ; GFX8-NEXT: v_readlane_b32 s55, v34, 23
39146 ; GFX8-NEXT: v_readlane_b32 s54, v34, 22
39147 ; GFX8-NEXT: v_readlane_b32 s53, v34, 21
39148 ; GFX8-NEXT: v_readlane_b32 s52, v34, 20
39149 ; GFX8-NEXT: v_readlane_b32 s51, v34, 19
39150 ; GFX8-NEXT: v_readlane_b32 s50, v34, 18
39151 ; GFX8-NEXT: v_readlane_b32 s49, v34, 17
39152 ; GFX8-NEXT: v_readlane_b32 s48, v34, 16
39153 ; GFX8-NEXT: v_readlane_b32 s47, v34, 15
39154 ; GFX8-NEXT: v_readlane_b32 s46, v34, 14
39155 ; GFX8-NEXT: v_readlane_b32 s45, v34, 13
39156 ; GFX8-NEXT: v_readlane_b32 s44, v34, 12
39157 ; GFX8-NEXT: v_readlane_b32 s43, v34, 11
39158 ; GFX8-NEXT: v_readlane_b32 s42, v34, 10
39159 ; GFX8-NEXT: v_readlane_b32 s41, v34, 9
39160 ; GFX8-NEXT: v_readlane_b32 s40, v34, 8
39161 ; GFX8-NEXT: v_readlane_b32 s39, v34, 7
39162 ; GFX8-NEXT: v_readlane_b32 s38, v34, 6
39163 ; GFX8-NEXT: v_readlane_b32 s37, v34, 5
39164 ; GFX8-NEXT: v_readlane_b32 s36, v34, 4
39165 ; GFX8-NEXT: v_readlane_b32 s35, v34, 3
39166 ; GFX8-NEXT: v_readlane_b32 s34, v34, 2
39167 ; GFX8-NEXT: v_readlane_b32 s31, v34, 1
39168 ; GFX8-NEXT: v_readlane_b32 s30, v34, 0
39169 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
39170 ; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39171 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
39172 ; GFX8-NEXT: s_waitcnt vmcnt(0)
39173 ; GFX8-NEXT: s_setpc_b64 s[30:31]
39175 ; GFX9-LABEL: v_vselect_v32bf16:
39177 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39178 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
39179 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
39180 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
39181 ; GFX9-NEXT: v_writelane_b32 v33, s30, 0
39182 ; GFX9-NEXT: v_writelane_b32 v33, s31, 1
39183 ; GFX9-NEXT: v_writelane_b32 v33, s34, 2
39184 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
39185 ; GFX9-NEXT: v_writelane_b32 v33, s35, 3
39186 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
39187 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
39188 ; GFX9-NEXT: v_writelane_b32 v33, s36, 4
39189 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
39190 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
39191 ; GFX9-NEXT: v_writelane_b32 v33, s37, 5
39192 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
39193 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
39194 ; GFX9-NEXT: v_writelane_b32 v33, s38, 6
39195 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
39196 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
39197 ; GFX9-NEXT: v_writelane_b32 v33, s39, 7
39198 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
39199 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
39200 ; GFX9-NEXT: v_writelane_b32 v33, s40, 8
39201 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
39202 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
39203 ; GFX9-NEXT: v_writelane_b32 v33, s41, 9
39204 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
39205 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
39206 ; GFX9-NEXT: v_writelane_b32 v33, s42, 10
39207 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
39208 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
39209 ; GFX9-NEXT: v_writelane_b32 v33, s43, 11
39210 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
39211 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
39212 ; GFX9-NEXT: v_writelane_b32 v33, s44, 12
39213 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
39214 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
39215 ; GFX9-NEXT: v_writelane_b32 v33, s45, 13
39216 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
39217 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
39218 ; GFX9-NEXT: v_writelane_b32 v33, s46, 14
39219 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
39220 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
39221 ; GFX9-NEXT: v_writelane_b32 v33, s47, 15
39222 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
39223 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
39224 ; GFX9-NEXT: v_writelane_b32 v33, s48, 16
39225 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
39226 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
39227 ; GFX9-NEXT: v_writelane_b32 v33, s49, 17
39228 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
39229 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
39230 ; GFX9-NEXT: v_writelane_b32 v33, s50, 18
39231 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
39232 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
39233 ; GFX9-NEXT: v_writelane_b32 v33, s51, 19
39234 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
39235 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19
39236 ; GFX9-NEXT: v_writelane_b32 v33, s52, 20
39237 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
39238 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
39239 ; GFX9-NEXT: v_writelane_b32 v33, s53, 21
39240 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
39241 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21
39242 ; GFX9-NEXT: v_writelane_b32 v33, s54, 22
39243 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
39244 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
39245 ; GFX9-NEXT: v_writelane_b32 v33, s55, 23
39246 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
39247 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23
39248 ; GFX9-NEXT: v_writelane_b32 v33, s56, 24
39249 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0
39250 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22
39251 ; GFX9-NEXT: v_writelane_b32 v33, s57, 25
39252 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0
39253 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25
39254 ; GFX9-NEXT: v_writelane_b32 v33, s58, 26
39255 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0
39256 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24
39257 ; GFX9-NEXT: v_writelane_b32 v33, s59, 27
39258 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0
39259 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27
39260 ; GFX9-NEXT: v_writelane_b32 v33, s60, 28
39261 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
39262 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26
39263 ; GFX9-NEXT: v_writelane_b32 v33, s61, 29
39264 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
39265 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29
39266 ; GFX9-NEXT: v_writelane_b32 v33, s62, 30
39267 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
39268 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28
39269 ; GFX9-NEXT: v_writelane_b32 v33, s63, 31
39270 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
39271 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
39272 ; GFX9-NEXT: v_writelane_b32 v33, s64, 32
39273 ; GFX9-NEXT: v_writelane_b32 v33, s65, 33
39274 ; GFX9-NEXT: v_writelane_b32 v33, s66, 34
39275 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
39276 ; GFX9-NEXT: v_writelane_b32 v33, s67, 35
39277 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
39278 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39279 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
39280 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0
39281 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v30
39282 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0
39283 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
39284 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
39285 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
39286 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
39287 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
39288 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
39289 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
39290 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
39291 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
39292 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
39293 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
39294 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
39295 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
39296 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
39297 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
39298 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
39299 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
39300 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
39301 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
39302 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
39303 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
39304 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
39305 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
39306 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
39307 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
39308 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
39309 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
39310 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
39311 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
39312 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60
39313 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
39314 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
39315 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39316 ; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67]
39317 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39318 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
39319 ; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65]
39320 ; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63]
39321 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
39322 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
39323 ; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61]
39324 ; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59]
39325 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
39326 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
39327 ; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39328 ; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55]
39329 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
39330 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
39331 ; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39332 ; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51]
39333 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
39334 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
39335 ; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39336 ; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47]
39337 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
39338 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
39339 ; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39340 ; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[42:43]
39341 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
39342 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
39343 ; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39344 ; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39]
39345 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
39346 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
39347 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39348 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35]
39349 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
39350 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
39351 ; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39352 ; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
39353 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
39354 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
39355 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39356 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
39357 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
39358 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
39359 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39360 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
39361 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
39362 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
39363 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39364 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
39365 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
39366 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
39367 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39368 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
39369 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
39370 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
39371 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39372 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
39373 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
39374 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
39375 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39376 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
39377 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
39378 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
39379 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
39380 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
39381 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
39382 ; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4
39383 ; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4
39384 ; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4
39385 ; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4
39386 ; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4
39387 ; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4
39388 ; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4
39389 ; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4
39390 ; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4
39391 ; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4
39392 ; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4
39393 ; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4
39394 ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4
39395 ; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4
39396 ; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4
39397 ; GFX9-NEXT: v_readlane_b32 s67, v33, 35
39398 ; GFX9-NEXT: v_readlane_b32 s66, v33, 34
39399 ; GFX9-NEXT: v_readlane_b32 s65, v33, 33
39400 ; GFX9-NEXT: v_readlane_b32 s64, v33, 32
39401 ; GFX9-NEXT: v_readlane_b32 s63, v33, 31
39402 ; GFX9-NEXT: v_readlane_b32 s62, v33, 30
39403 ; GFX9-NEXT: v_readlane_b32 s61, v33, 29
39404 ; GFX9-NEXT: v_readlane_b32 s60, v33, 28
39405 ; GFX9-NEXT: v_readlane_b32 s59, v33, 27
39406 ; GFX9-NEXT: v_readlane_b32 s58, v33, 26
39407 ; GFX9-NEXT: v_readlane_b32 s57, v33, 25
39408 ; GFX9-NEXT: v_readlane_b32 s56, v33, 24
39409 ; GFX9-NEXT: v_readlane_b32 s55, v33, 23
39410 ; GFX9-NEXT: v_readlane_b32 s54, v33, 22
39411 ; GFX9-NEXT: v_readlane_b32 s53, v33, 21
39412 ; GFX9-NEXT: v_readlane_b32 s52, v33, 20
39413 ; GFX9-NEXT: v_readlane_b32 s51, v33, 19
39414 ; GFX9-NEXT: v_readlane_b32 s50, v33, 18
39415 ; GFX9-NEXT: v_readlane_b32 s49, v33, 17
39416 ; GFX9-NEXT: v_readlane_b32 s48, v33, 16
39417 ; GFX9-NEXT: v_readlane_b32 s47, v33, 15
39418 ; GFX9-NEXT: v_readlane_b32 s46, v33, 14
39419 ; GFX9-NEXT: v_readlane_b32 s45, v33, 13
39420 ; GFX9-NEXT: v_readlane_b32 s44, v33, 12
39421 ; GFX9-NEXT: v_readlane_b32 s43, v33, 11
39422 ; GFX9-NEXT: v_readlane_b32 s42, v33, 10
39423 ; GFX9-NEXT: v_readlane_b32 s41, v33, 9
39424 ; GFX9-NEXT: v_readlane_b32 s40, v33, 8
39425 ; GFX9-NEXT: v_readlane_b32 s39, v33, 7
39426 ; GFX9-NEXT: v_readlane_b32 s38, v33, 6
39427 ; GFX9-NEXT: v_readlane_b32 s37, v33, 5
39428 ; GFX9-NEXT: v_readlane_b32 s36, v33, 4
39429 ; GFX9-NEXT: v_readlane_b32 s35, v33, 3
39430 ; GFX9-NEXT: v_readlane_b32 s34, v33, 2
39431 ; GFX9-NEXT: v_readlane_b32 s31, v33, 1
39432 ; GFX9-NEXT: v_readlane_b32 s30, v33, 0
39433 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
39434 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39435 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
39436 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39437 ; GFX9-NEXT: s_setpc_b64 s[30:31]
39439 ; GFX10-LABEL: v_vselect_v32bf16:
39441 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39442 ; GFX10-NEXT: s_clause 0xa
39443 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
39444 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
39445 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
39446 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
39447 ; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32
39448 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128
39449 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
39450 ; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96
39451 ; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108
39452 ; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44
39453 ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
39454 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
39455 ; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
39456 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
39457 ; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
39458 ; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
39459 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
39460 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18
39461 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
39462 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13
39463 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19
39464 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
39465 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
39466 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
39467 ; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
39468 ; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
39469 ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
39470 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
39471 ; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
39472 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
39473 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
39474 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
39475 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
39476 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
39477 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
39478 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
39479 ; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
39480 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
39481 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
39482 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
39483 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
39484 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
39485 ; GFX10-NEXT: s_waitcnt vmcnt(10)
39486 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31
39487 ; GFX10-NEXT: s_waitcnt vmcnt(9)
39488 ; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32
39489 ; GFX10-NEXT: s_waitcnt vmcnt(8)
39490 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33
39491 ; GFX10-NEXT: s_waitcnt vmcnt(7)
39492 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6
39493 ; GFX10-NEXT: s_waitcnt vmcnt(6)
39494 ; GFX10-NEXT: v_and_b32_e32 v35, 1, v35
39495 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12
39496 ; GFX10-NEXT: s_waitcnt vmcnt(4)
39497 ; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo
39498 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39499 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39500 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35
39501 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34
39502 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6
39503 ; GFX10-NEXT: s_clause 0x6
39504 ; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
39505 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
39506 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
39507 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
39508 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76
39509 ; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12
39510 ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
39511 ; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4
39512 ; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
39513 ; GFX10-NEXT: s_clause 0x1
39514 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124
39515 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60
39516 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
39517 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v29
39518 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5
39519 ; GFX10-NEXT: s_waitcnt vmcnt(3)
39520 ; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52
39521 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39522 ; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo
39523 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39524 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39525 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
39526 ; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo
39527 ; GFX10-NEXT: s_clause 0x1
39528 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
39529 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
39530 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
39531 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v27
39532 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39533 ; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
39534 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39535 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39536 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
39537 ; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
39538 ; GFX10-NEXT: s_clause 0x1
39539 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116
39540 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
39541 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
39542 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v25
39543 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39544 ; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
39545 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39546 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39547 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
39548 ; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo
39549 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
39550 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
39551 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v23
39552 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
39553 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39554 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo
39555 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39556 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
39557 ; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53
39558 ; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo
39559 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
39560 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48
39561 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39
39562 ; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo
39563 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
39564 ; GFX10-NEXT: s_clause 0x1
39565 ; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
39566 ; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
39567 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo
39568 ; GFX10-NEXT: s_clause 0x1
39569 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
39570 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
39571 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
39572 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39573 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo
39574 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
39575 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39576 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39577 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo
39578 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
39579 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
39580 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
39581 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo
39582 ; GFX10-NEXT: s_clause 0x1
39583 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88
39584 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
39585 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
39586 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo
39587 ; GFX10-NEXT: s_clause 0x1
39588 ; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84
39589 ; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
39590 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
39591 ; GFX10-NEXT: s_waitcnt vmcnt(2)
39592 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo
39593 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
39594 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39595 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39596 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39597 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo
39598 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
39599 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
39600 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
39601 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo
39602 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
39603 ; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48
39604 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo
39605 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
39606 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34
39607 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo
39608 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
39609 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33
39610 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39611 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo
39612 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
39613 ; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
39614 ; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
39615 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo
39616 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
39617 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo
39618 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
39619 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo
39620 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
39621 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo
39622 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
39623 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
39624 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo
39625 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
39626 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
39627 ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
39628 ; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100
39629 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
39630 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo
39631 ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
39632 ; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
39633 ; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
39634 ; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100
39635 ; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
39636 ; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
39637 ; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100
39638 ; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100
39639 ; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100
39640 ; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100
39641 ; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100
39642 ; GFX10-NEXT: s_setpc_b64 s[30:31]
39644 ; GFX11TRUE16-LABEL: v_vselect_v32bf16:
39645 ; GFX11TRUE16: ; %bb.0:
39646 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39647 ; GFX11TRUE16-NEXT: s_clause 0x1f
39648 ; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32
39649 ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
39650 ; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
39651 ; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
39652 ; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
39653 ; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
39654 ; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
39655 ; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
39656 ; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
39657 ; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
39658 ; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
39659 ; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
39660 ; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
39661 ; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
39662 ; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
39663 ; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
39664 ; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
39665 ; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
39666 ; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
39667 ; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
39668 ; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
39669 ; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
39670 ; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
39671 ; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
39672 ; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
39673 ; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
39674 ; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
39675 ; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
39676 ; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
39677 ; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
39678 ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
39679 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
39680 ; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
39681 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
39682 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
39683 ; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
39684 ; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
39685 ; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
39686 ; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
39687 ; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
39688 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
39689 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
39690 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
39691 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
39692 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
39693 ; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
39694 ; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
39695 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
39696 ; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
39697 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
39698 ; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
39699 ; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
39700 ; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
39701 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
39702 ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
39703 ; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
39704 ; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
39705 ; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
39706 ; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
39707 ; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
39708 ; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
39709 ; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
39710 ; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
39711 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
39712 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
39713 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
39714 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
39715 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
39716 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26
39717 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28
39718 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
39719 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
39720 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
39721 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
39722 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
39723 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
39724 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
39725 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
39726 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
39727 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
39728 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
39729 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
39730 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
39731 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
39732 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
39733 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
39734 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
39735 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
39736 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
39737 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
39738 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
39739 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
39740 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
39741 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
39742 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
39743 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
39744 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v31
39745 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
39746 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
39747 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
39748 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v33.l, s26
39749 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v33
39750 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
39751 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v35.l, s29
39752 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v35
39753 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v34
39754 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
39755 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v36.l, v37.l, s27
39756 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v37
39757 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v36
39758 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
39759 ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v38.l, v39.l, s24
39760 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v39
39761 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v38
39762 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
39763 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v48.l, v49.l, s22
39764 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v49
39765 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v48
39766 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
39767 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v53
39768 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v52
39769 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
39770 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v65
39771 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v64
39772 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v51.l, s20
39773 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
39774 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v68
39775 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
39776 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v69
39777 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
39778 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v70
39779 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
39780 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v71
39781 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
39782 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v80
39783 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
39784 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v81
39785 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
39786 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v82
39787 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
39788 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v83
39789 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
39790 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v84
39791 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
39792 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v85
39793 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
39794 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v86
39795 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
39796 ; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v86.l, v87.l, s0
39797 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v87
39798 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v8
39799 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v51
39800 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v50
39801 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v52.l, v53.l, s18
39802 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v54.l, v55.l, s16
39803 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55
39804 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54
39805 ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v64.l, v65.l, s14
39806 ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.l, v67.l, s12
39807 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v67
39808 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v66
39809 ; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.l, v71.l, s8
39810 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v82.l, v83.l, s4
39811 ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v10.l, v9.l, s28
39812 ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v12.l, v11.l, s25
39813 ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v13.l, s23
39814 ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v18.l, v15.l, s21
39815 ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v22.l, v21.l, s17
39816 ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s13
39817 ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v30.l, v29.l, s9
39818 ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v32.l, v31.l, s7
39819 ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v34.l, v33.l, s5
39820 ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v36.l, v35.l, s3
39821 ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v38.l, v37.l, s1
39822 ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo
39823 ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, s0
39824 ; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v68.l, v69.l, s10
39825 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v80.l, v81.l, s6
39826 ; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v84.l, v85.l, s2
39827 ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v20.l, v19.l, s19
39828 ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v23.l, s15
39829 ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v27.l, s11
39830 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.h
39831 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h
39832 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h
39833 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h
39834 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
39835 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v23.l, v3.h
39836 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v24.l, v3.l
39837 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v2.h
39838 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v2.l
39839 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v1.h
39840 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v1.l
39841 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v0.h
39842 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.l
39843 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v15.l
39844 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h
39845 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
39846 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
39847 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l
39848 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.h
39849 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.h
39850 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.h
39851 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.h
39852 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v31.l, v9.l
39853 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v8.h
39854 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.l
39855 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
39856 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v18, 0x5040100
39857 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v1, v7, 0x5040100
39858 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
39859 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v3, v6, 0x5040100
39860 ; GFX11TRUE16-NEXT: v_perm_b32 v4, v4, v20, 0x5040100
39861 ; GFX11TRUE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
39862 ; GFX11TRUE16-NEXT: v_perm_b32 v6, v12, v21, 0x5040100
39863 ; GFX11TRUE16-NEXT: v_perm_b32 v7, v14, v22, 0x5040100
39864 ; GFX11TRUE16-NEXT: v_perm_b32 v8, v11, v23, 0x5040100
39865 ; GFX11TRUE16-NEXT: v_perm_b32 v9, v16, v24, 0x5040100
39866 ; GFX11TRUE16-NEXT: v_perm_b32 v10, v10, v25, 0x5040100
39867 ; GFX11TRUE16-NEXT: v_perm_b32 v11, v17, v26, 0x5040100
39868 ; GFX11TRUE16-NEXT: v_perm_b32 v12, v31, v27, 0x5040100
39869 ; GFX11TRUE16-NEXT: v_perm_b32 v13, v32, v28, 0x5040100
39870 ; GFX11TRUE16-NEXT: v_perm_b32 v14, v33, v29, 0x5040100
39871 ; GFX11TRUE16-NEXT: v_perm_b32 v15, v15, v30, 0x5040100
39872 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
39874 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
39875 ; GFX11FAKE16: ; %bb.0:
39876 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39877 ; GFX11FAKE16-NEXT: s_clause 0x1f
39878 ; GFX11FAKE16-NEXT: scratch_load_u16 v31, off, s32
39879 ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
39880 ; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
39881 ; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
39882 ; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
39883 ; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
39884 ; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
39885 ; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
39886 ; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
39887 ; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
39888 ; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
39889 ; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
39890 ; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
39891 ; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
39892 ; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
39893 ; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
39894 ; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
39895 ; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
39896 ; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
39897 ; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
39898 ; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
39899 ; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
39900 ; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
39901 ; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
39902 ; GFX11FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
39903 ; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
39904 ; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
39905 ; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
39906 ; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
39907 ; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
39908 ; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
39909 ; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
39910 ; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
39911 ; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30
39912 ; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28
39913 ; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26
39914 ; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24
39915 ; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22
39916 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
39917 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
39918 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20
39919 ; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18
39920 ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16
39921 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30)
39922 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
39923 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
39924 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
39925 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
39926 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39927 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
39928 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28)
39929 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo
39930 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
39931 ; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7
39932 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
39933 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
39934 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
39935 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26)
39936 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
39937 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
39938 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
39939 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39940 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39941 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
39942 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24)
39943 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo
39944 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
39945 ; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
39946 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
39947 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
39948 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
39949 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22)
39950 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo
39951 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
39952 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
39953 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
39954 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
39955 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
39956 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20)
39957 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo
39958 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
39959 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
39960 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
39961 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50
39962 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
39963 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18)
39964 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo
39965 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
39966 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
39967 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
39968 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52
39969 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
39970 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16)
39971 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo
39972 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
39973 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
39974 ; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
39975 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
39976 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
39977 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14)
39978 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
39979 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
39980 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17
39981 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
39982 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64
39983 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12)
39984 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo
39985 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
39986 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
39987 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
39988 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66
39989 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10)
39990 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo
39991 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
39992 ; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
39993 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
39994 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
39995 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8)
39996 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo
39997 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
39998 ; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27
39999 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
40000 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70
40001 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6)
40002 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo
40003 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
40004 ; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
40005 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
40006 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80
40007 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4)
40008 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo
40009 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
40010 ; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31
40011 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
40012 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
40013 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2)
40014 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo
40015 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
40016 ; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
40017 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
40018 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84
40019 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
40020 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo
40021 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
40022 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
40023 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86
40024 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo
40025 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
40026 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo
40027 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
40028 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
40029 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
40030 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
40031 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
40032 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo
40033 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
40034 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
40035 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
40036 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
40037 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
40038 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
40039 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
40040 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo
40041 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
40042 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo
40043 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
40044 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo
40045 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
40046 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
40047 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40048 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo
40049 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40050 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo
40051 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
40052 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
40053 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
40054 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo
40055 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
40056 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
40057 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
40058 ; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
40059 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
40060 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo
40061 ; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
40062 ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
40063 ; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
40064 ; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100
40065 ; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
40066 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
40067 ; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100
40068 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100
40069 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100
40070 ; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100
40071 ; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100
40072 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
40073 %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
40074 ret <32 x bfloat> %op
; Declarations of the llvm.fma intrinsic overloads for scalar bfloat and for
; 2-, 3- and 4-element bfloat vectors.
40077 declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
40078 declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40079 declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40080 declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; Scalar bfloat fused multiply-add: passes %a, %b and %c to llvm.fma.bf16 and
; returns the result.
; The autogenerated checks below show every tested target doing the arithmetic
; with a 32-bit float FMA (v_fma_f32 or v_fmac_f32). From the tonga run line
; onwards the three inputs are first shifted left by 16 and the selected result
; is shifted right by 16 before returning.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
40082 define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
40083 ; GCN-LABEL: v_fma_bf16:
40085 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40086 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40087 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40088 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40089 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40090 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40091 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40092 ; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
40093 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40094 ; GCN-NEXT: s_setpc_b64 s[30:31]
40096 ; GFX7-LABEL: v_fma_bf16:
40098 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40099 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40100 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40101 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40102 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40103 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40104 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40105 ; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
40106 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40107 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40109 ; GFX8-LABEL: v_fma_bf16:
40111 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40112 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40113 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40114 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40115 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
40116 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40117 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40118 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40119 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40120 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40121 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40122 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40123 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40125 ; GFX9-LABEL: v_fma_bf16:
40127 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40128 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40129 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40130 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40131 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
40132 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40133 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40134 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40135 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40136 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40137 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40138 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40139 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40141 ; GFX10-LABEL: v_fma_bf16:
40143 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40144 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40145 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40146 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40147 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
40148 ; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
40149 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
40150 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40151 ; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
40152 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40153 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40154 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40156 ; GFX11-LABEL: v_fma_bf16:
40158 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40159 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40160 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40161 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40162 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40163 ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
40164 ; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
40165 ; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2
40166 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40167 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40168 ; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
40169 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40171 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40172 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40173 %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
  ret bfloat %op
; Two-element bfloat vector fused multiply-add: passes %a, %b and %c to
; llvm.fma.v2bf16 and returns the <2 x bfloat> result.
; The autogenerated checks below show two 32-bit float FMAs on every tested
; target. The first two run lines operate on six registers v0-v5. From the
; tonga run line onwards each operand arrives in one register: a shift left by
; 16 and a mask with 0xffff0000 produce the two FMA inputs, and the two results
; are merged into v0 with v_alignbit_b32 (tonga) or v_perm_b32 (later targets).
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
40177 define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40178 ; GCN-LABEL: v_fma_v2bf16:
40180 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40181 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40182 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40183 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40184 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40185 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40186 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40187 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40188 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40189 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40190 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40191 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40192 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40193 ; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
40194 ; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
40195 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40196 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40197 ; GCN-NEXT: s_setpc_b64 s[30:31]
40199 ; GFX7-LABEL: v_fma_v2bf16:
40201 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40202 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40203 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40204 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40205 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40206 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40207 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40208 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40209 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40210 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40211 ; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
40212 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
40213 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40214 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40215 ; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3
40216 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40217 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40218 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40220 ; GFX8-LABEL: v_fma_v2bf16:
40222 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40223 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40224 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40225 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40226 ; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
40227 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
40228 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
40229 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40230 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40231 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40232 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
40233 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
40234 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
40235 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40236 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40237 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
40238 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40239 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40240 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40241 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40242 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40243 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40244 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40245 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40247 ; GFX9-LABEL: v_fma_v2bf16:
40249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40250 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40251 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40252 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40253 ; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
40254 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40255 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40256 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40257 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
40258 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40259 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
40260 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
40261 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
40262 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40263 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40264 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
40265 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40266 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40267 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40268 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40269 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40270 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40271 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40273 ; GFX10-LABEL: v_fma_v2bf16:
40275 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40276 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40277 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40278 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40279 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40280 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40281 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40282 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
40283 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
40284 ; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
40285 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
40286 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
40287 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
40288 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
40289 ; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
40290 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
40291 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40292 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40293 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40294 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
40295 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40297 ; GFX11-LABEL: v_fma_v2bf16:
40299 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40300 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40301 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40302 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40303 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40304 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40305 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40306 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40307 ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
40308 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
40309 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
40310 ; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
40311 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
40312 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
40313 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
40314 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
40315 ; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
40316 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
40317 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40318 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40319 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40320 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40321 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
40322 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40323 %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
40324 ret <2 x bfloat> %op
; v_fma_v3bf16: passes three <3 x bfloat> arguments to llvm.fma.v3bf16 and
; returns the result unchanged.
;
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; What the expected output shows:
;  - GCN and GFX7 prefixes - one v_fma_f32 per element on operands masked with
;    0xffff0000, with each result masked with 0xffff0000 again.
;  - GFX8, GFX9 and GFX10 prefixes - one f32 fma/fmac per element, each
;    followed by a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 sequence (presumably round-to-nearest-even with NaN
;    quieting for the f32 to bf16 conversion - confirm against the lowering),
;    after which the results are packed with v_lshrrev_b32, v_alignbit_b32
;    and/or v_perm_b32.
;  - gfx1100 is checked separately under the TRUE16 and FAKE16 prefixes. The
;    two bodies differ only in the final v_alignbit_b32 (first source v0 vs.
;    s0) and in the s_delay_alu immediately before it.
40327 define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
40328 ; GCN-LABEL: v_fma_v3bf16:
40330 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40331 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40332 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40333 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
40334 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40335 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40336 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
40337 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40338 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40339 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
40340 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40341 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40342 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40343 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40344 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40345 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40346 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40347 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40348 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40349 ; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
40350 ; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
40351 ; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
40352 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40353 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40354 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40355 ; GCN-NEXT: s_setpc_b64 s[30:31]
40357 ; GFX7-LABEL: v_fma_v3bf16:
40359 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40360 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40361 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40362 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
40363 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40364 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40365 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
40366 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40367 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40368 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40369 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40370 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40371 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
40372 ; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8
40373 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
40374 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40375 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40376 ; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5
40377 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
40378 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40379 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40380 ; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4
40381 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40382 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40383 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40384 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40386 ; GFX8-LABEL: v_fma_v3bf16:
40388 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40389 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40390 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40391 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40392 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
40393 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
40394 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
40395 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
40396 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
40397 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40398 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40399 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40400 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40401 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
40402 ; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
40403 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
40404 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
40405 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
40406 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40407 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40408 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40409 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
40410 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
40411 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
40412 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40413 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
40414 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
40415 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
40416 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
40417 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
40418 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40419 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40420 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40421 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
40422 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40423 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40425 ; GFX9-LABEL: v_fma_v3bf16:
40427 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40428 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40429 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40430 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40431 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
40432 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
40433 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40434 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
40435 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
40436 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40437 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40438 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40439 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40440 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
40441 ; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
40442 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40443 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40444 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40445 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
40446 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
40447 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
40448 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
40449 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40450 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
40451 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
40452 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
40453 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
40454 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40455 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40456 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40457 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40458 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
40459 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40461 ; GFX10-LABEL: v_fma_v3bf16:
40463 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40464 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40465 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40466 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40467 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40468 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40469 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40470 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40471 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40472 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40473 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
40474 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
40475 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
40476 ; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
40477 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
40478 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40479 ; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
40480 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
40481 ; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40482 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
40483 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
40484 ; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40485 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40486 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40487 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40488 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40489 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40490 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40491 ; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40492 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
40493 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40495 ; GFX11TRUE16-LABEL: v_fma_v3bf16:
40496 ; GFX11TRUE16: ; %bb.0:
40497 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40498 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40499 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40500 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40501 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40502 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40503 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40504 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40505 ; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40506 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40507 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2
40508 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40509 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40510 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
40511 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
40512 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40513 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40514 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40515 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
40516 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
40517 ; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
40518 ; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
40519 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40520 ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40521 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
40522 ; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40523 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40524 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40525 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40526 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40527 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40528 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40529 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40530 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
40531 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v3, 16
40532 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
40534 ; GFX11FAKE16-LABEL: v_fma_v3bf16:
40535 ; GFX11FAKE16: ; %bb.0:
40536 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40537 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40538 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40539 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40540 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40541 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40542 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40543 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40544 ; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40545 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40546 ; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
40547 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40548 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40549 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
40550 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
40551 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40552 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40553 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40554 ; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
40555 ; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
40556 ; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
40557 ; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
40558 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40559 ; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40560 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
40561 ; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40562 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40563 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40564 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40565 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40566 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40567 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40568 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40569 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
40570 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
40571 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
40572 %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
40573 ret <3 x bfloat> %op
; v_fma_v4bf16: passes three <4 x bfloat> arguments to llvm.fma.v4bf16 and
; returns the result unchanged.
;
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; What the expected output shows:
;  - GCN and GFX7 prefixes - four v_fma_f32 (one per element) on operands
;    masked with 0xffff0000, with each result masked with 0xffff0000 again.
;  - GFX8 and newer prefixes - four f32 fma/fmac instructions, each followed
;    by a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 sequence (presumably round-to-nearest-even with NaN
;    quieting for the f32 to bf16 conversion - confirm against the lowering).
;    The two result registers are then packed with v_alignbit_b32 on GFX8 and
;    with v_perm_b32 (selector 0x7060302) on GFX9, GFX10 and GFX11.
;  - Both gfx1100 RUN lines share the common GFX11 prefix for this function.
40576 define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
40577 ; GCN-LABEL: v_fma_v4bf16:
40579 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40580 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40581 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40582 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
40583 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40584 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40585 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
40586 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40587 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
40588 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
40589 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40590 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
40591 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
40592 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
40593 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40594 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40595 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
40596 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40597 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40598 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
40599 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40600 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40601 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40602 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40603 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40604 ; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
40605 ; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
40606 ; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
40607 ; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
40608 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40609 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40610 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40611 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40612 ; GCN-NEXT: s_setpc_b64 s[30:31]
40614 ; GFX7-LABEL: v_fma_v4bf16:
40616 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40617 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40618 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
40619 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
40620 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40621 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
40622 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
40623 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
40624 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40625 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40626 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40627 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40628 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
40629 ; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11
40630 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
40631 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40632 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40633 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40634 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40635 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
40636 ; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7
40637 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
40638 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40639 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40640 ; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6
40641 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
40642 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40643 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40644 ; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5
40645 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40646 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40647 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40648 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40649 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40651 ; GFX8-LABEL: v_fma_v4bf16:
40653 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40654 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40655 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40656 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40657 ; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
40658 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
40659 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
40660 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40661 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40662 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40663 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
40664 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
40665 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
40666 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
40667 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
40668 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
40669 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
40670 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
40671 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
40672 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
40673 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40674 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40675 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40676 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40677 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
40678 ; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
40679 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
40680 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
40681 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40682 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40683 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40684 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
40685 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
40686 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
40687 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40688 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
40689 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
40690 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
40691 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
40692 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
40693 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40694 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40695 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
40696 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40697 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40698 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
40699 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40701 ; GFX9-LABEL: v_fma_v4bf16:
40703 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40704 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40705 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40706 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40707 ; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
40708 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40709 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40710 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40711 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
40712 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40713 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
40714 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
40715 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
40716 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
40717 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
40718 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
40719 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
40720 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
40721 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40722 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40723 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40724 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40725 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
40726 ; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
40727 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40728 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40729 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40730 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
40731 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
40732 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
40733 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
40734 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40735 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
40736 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
40737 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
40738 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
40739 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40740 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40741 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40742 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40743 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
40744 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40746 ; GFX10-LABEL: v_fma_v4bf16:
40748 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40749 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40750 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40751 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40752 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40753 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40754 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40755 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
40756 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
40757 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
40758 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
40759 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40760 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40761 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40762 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
40763 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
40764 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
40765 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
40766 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
40767 ; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
40768 ; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
40769 ; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
40770 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40771 ; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
40772 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
40773 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40774 ; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
40775 ; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
40776 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
40777 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
40778 ; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
40779 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
40780 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40781 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40782 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40783 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40784 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40785 ; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
40786 ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
40787 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40789 ; GFX11-LABEL: v_fma_v4bf16:
40791 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40792 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40793 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40794 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
40795 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40796 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40797 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40798 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40799 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40800 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
40801 ; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3
40802 ; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
40803 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40804 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
40805 ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
40806 ; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6
40807 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40808 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
40809 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40810 ; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2
40811 ; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
40812 ; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
40813 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
40814 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40815 ; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8
40816 ; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
40817 ; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
40818 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
40819 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
40820 ; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
40821 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
40822 ; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
40823 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
40824 ; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
40825 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
40826 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40827 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40828 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40829 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40830 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40831 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40832 ; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
40833 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
40834 ; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
40835 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40836 %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
40837 ret <4 x bfloat> %op
; Declarations of the llvm.fmuladd intrinsic overloads for scalar bfloat and
; for 2-, 3- and 4-element bfloat vectors. v_fmuladd_bf16 below calls the
; scalar overload.
40840 declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
40841 declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40842 declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40843 declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; v_fmuladd_bf16: calls llvm.fmuladd.bf16 on its three scalar bfloat
; arguments.
;
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; What the expected output shows:
;  - For every checked target the operation is emitted as a separate
;    v_mul_f32 of the two multiplicands followed by a v_add_f32 with the
;    addend; no fused multiply-add instruction appears in any sequence.
;  - GCN and GFX7 prefixes - the product and the sum are each masked with
;    0xffff0000.
;  - GFX8 and newer prefixes - the product and the sum are each followed by
;    their own v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;    cndmask sequence (presumably round-to-nearest-even with NaN quieting for
;    the f32 to bf16 conversion - confirm against the lowering), and the
;    final value is shifted right by 16.
;  - Both gfx1100 RUN lines share the common GFX11 prefix for this function.
40845 define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
40846 ; GCN-LABEL: v_fmuladd_bf16:
40848 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40849 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40850 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40851 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40852 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40853 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40854 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
40855 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40856 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
40857 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
40858 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40859 ; GCN-NEXT: s_setpc_b64 s[30:31]
40861 ; GFX7-LABEL: v_fmuladd_bf16:
40863 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40864 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40865 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40866 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40867 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40868 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40869 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
40870 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40871 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
40872 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
40873 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40874 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40876 ; GFX8-LABEL: v_fmuladd_bf16:
40878 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40879 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40880 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40881 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
40882 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40883 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40884 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40885 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
40886 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40887 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
40888 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40889 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40890 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
40891 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40892 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40893 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40894 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40895 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40896 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40897 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40898 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40900 ; GFX9-LABEL: v_fmuladd_bf16:
40902 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40903 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40904 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40905 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
40906 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40907 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40908 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40909 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
40910 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40911 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
40912 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40913 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40914 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
40915 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40916 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40917 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40918 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40919 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40920 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40921 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40923 ; GFX10-LABEL: v_fmuladd_bf16:
40925 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40926 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40927 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40928 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
40929 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
40930 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
40931 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40932 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40933 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
40934 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40935 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40936 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
40937 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
40938 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
40939 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40940 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40941 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40942 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40943 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40945 ; GFX11-LABEL: v_fmuladd_bf16:
40947 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40948 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40949 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40950 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40951 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
40952 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
40953 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
40954 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40955 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40956 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40957 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
40958 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40959 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40960 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
40961 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40962 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
40963 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
40964 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40965 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40966 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40967 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40968 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40969 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40970 %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
; v_fmuladd_v2bf16: calls llvm.fmuladd.v2bf16 on three <2 x bfloat> arguments
; and returns the result. The assertions below are autogenerated (see the note
; on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Every expected sequence performs a separate v_mul_f32 and v_add_f32 per
; element; no fused multiply-add instruction appears in any check block.
; - Runs without -mcpu and with hawaii: one element per VGPR (a in v0/v1,
;   b in v2/v3, c in v4/v5), masked with 0xffff0000 around each f32 operation.
; - Runs for tonga and newer: both elements of an argument share one VGPR
;   (a in v0, b in v1, c in v2). The product and the sum each pass through a
;   v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably
;   round-to-nearest-even with NaN quieting; confirm against the bf16
;   lowering) and the two halves are repacked into v0 with v_alignbit_b32 or
;   v_perm_b32.
40974 define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40975 ; GCN-LABEL: v_fmuladd_v2bf16:
40977 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40978 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40979 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40980 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40981 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40982 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40983 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40984 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40985 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40986 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40987 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40988 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40989 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40990 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
40991 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
40992 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40993 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40994 ; GCN-NEXT: v_add_f32_e32 v1, v1, v5
40995 ; GCN-NEXT: v_add_f32_e32 v0, v0, v4
40996 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40997 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40998 ; GCN-NEXT: s_setpc_b64 s[30:31]
41000 ; GFX7-LABEL: v_fmuladd_v2bf16:
41002 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41003 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
41004 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
41005 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41006 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41007 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41008 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41009 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41010 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41011 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41012 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41013 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
41014 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
41015 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41016 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41017 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41018 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41019 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
41020 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
41021 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41022 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41023 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41025 ; GFX8-LABEL: v_fmuladd_v2bf16:
41027 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41028 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41029 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41030 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
41031 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
41032 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
41033 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
41034 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
41035 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41036 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41037 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41038 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
41039 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
41040 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
41041 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41042 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
41043 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41044 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41045 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
41046 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
41047 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
41048 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41049 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
41050 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41051 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
41052 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
41053 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41054 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41055 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
41056 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41057 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
41058 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
41059 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
41060 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
41061 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
41062 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
41063 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41064 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
41065 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41066 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41067 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41069 ; GFX9-LABEL: v_fmuladd_v2bf16:
41071 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41072 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41073 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41074 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
41075 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
41076 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41077 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
41078 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
41079 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41080 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41081 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41082 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
41083 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
41084 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41085 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41086 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
41087 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
41088 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
41089 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
41090 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41091 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
41092 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41093 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
41094 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41095 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41096 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
41097 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41098 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
41099 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
41100 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
41101 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
41102 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
41103 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41104 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
41105 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41106 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41109 ; GFX10-LABEL: v_fmuladd_v2bf16:
41111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41112 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41113 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41114 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41115 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41116 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
41117 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
41118 ; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
41119 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
41120 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
41121 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41122 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
41123 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
41124 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41125 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
41126 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41127 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41128 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41129 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41130 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
41131 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
41132 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41133 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
41134 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
41135 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41136 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41137 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
41138 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41139 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
41140 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
41141 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41142 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41143 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41144 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
41145 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41147 ; GFX11-LABEL: v_fmuladd_v2bf16:
41149 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41150 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41151 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41152 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41153 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
41154 ; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
41155 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
41156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41157 ; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
41158 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
41159 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41160 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
41161 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41162 ; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
41163 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
41164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
41165 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41166 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41167 ; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
41168 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41169 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41170 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41171 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
41172 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
41173 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41174 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
41175 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41176 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41177 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
41178 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
41179 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
41180 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41181 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41182 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
41183 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
41184 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41185 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41186 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41187 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41188 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
41189 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
41190 ; GFX11-NEXT: s_setpc_b64 s[30:31]
41191 %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
41192 ret <2 x bfloat> %op
; v_fmuladd_v3bf16: calls llvm.fmuladd.v3bf16 on three <3 x bfloat> arguments
; and returns the result. The assertions below are autogenerated (see the note
; on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Every expected sequence performs a separate v_mul_f32 and v_add_f32 per
; element; no fused multiply-add instruction appears in any check block.
; - Runs without -mcpu and with hawaii: one element per VGPR (a in v0-v2,
;   b in v3-v5, c in v6-v8), masked with 0xffff0000 around each f32 operation.
; - Runs for tonga and newer: each argument occupies two VGPRs (a in v0/v1,
;   b in v2/v3, c in v4/v5); the third element is read from the low half of
;   the second register. The product and the sum each pass through a
;   v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably
;   round-to-nearest-even with NaN quieting; confirm against the bf16
;   lowering).
; - In the gfx900 and newer blocks the returned v1 is built with
;   v_alignbit_b32 whose upper source is an otherwise unrelated register
;   (s4, s0 or v0); presumably the unused fourth lane is undefined, confirm.
; - gfx1100 has separate check blocks for +real-true16 and -real-true16; the
;   only visible difference between them is that upper source operand of the
;   final v_alignbit_b32 (v0 versus s0).
41195 define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
41196 ; GCN-LABEL: v_fmuladd_v3bf16:
41198 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41199 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
41200 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
41201 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
41202 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
41203 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
41204 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
41205 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
41206 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
41207 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
41208 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41209 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41210 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
41211 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41212 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41213 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41214 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41215 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41216 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41217 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
41218 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
41219 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
41220 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41221 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41222 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41223 ; GCN-NEXT: v_add_f32_e32 v2, v2, v8
41224 ; GCN-NEXT: v_add_f32_e32 v1, v1, v7
41225 ; GCN-NEXT: v_add_f32_e32 v0, v0, v6
41226 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41227 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41228 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41229 ; GCN-NEXT: s_setpc_b64 s[30:31]
41231 ; GFX7-LABEL: v_fmuladd_v3bf16:
41233 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41234 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
41235 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41236 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41237 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41238 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
41239 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41240 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41241 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41242 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41243 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41244 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41245 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41246 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
41247 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
41248 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
41249 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
41250 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
41251 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
41252 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41253 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
41254 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41255 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
41256 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41257 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
41258 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
41259 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
41260 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
41261 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41262 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41263 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41264 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41266 ; GFX8-LABEL: v_fmuladd_v3bf16:
41268 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41269 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41270 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41271 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
41272 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41273 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41274 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
41275 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
41276 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41277 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
41278 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41279 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41280 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
41281 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41282 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41283 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41284 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41285 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
41286 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41287 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41288 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41289 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41290 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
41291 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41292 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41293 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41294 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
41295 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41296 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41297 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41298 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41299 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
41300 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41301 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41302 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41303 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41304 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41305 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
41306 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
41307 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41308 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41309 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41310 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41311 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
41312 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
41313 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41314 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41315 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41316 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41317 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
41318 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41319 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41320 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
41321 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41322 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41323 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41324 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41325 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
41326 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41327 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41329 ; GFX9-LABEL: v_fmuladd_v3bf16:
41331 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41332 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41333 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41334 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
41335 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41336 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41337 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41338 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
41339 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41340 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
41341 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41342 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41343 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
41344 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41345 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41346 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
41347 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41348 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41349 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41350 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41351 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
41352 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41353 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41354 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
41355 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41356 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41357 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41358 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41359 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
41360 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41361 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41362 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41363 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
41364 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41365 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
41366 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41367 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41368 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41369 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41370 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
41371 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41372 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41373 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41374 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41375 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
41376 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41377 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41378 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41379 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41380 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41381 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41382 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41383 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
41384 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41386 ; GFX10-LABEL: v_fmuladd_v3bf16:
41388 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41389 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41390 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41391 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41392 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41393 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41394 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41395 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
41396 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
41397 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
41398 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41399 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41400 ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
41401 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41402 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
41403 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41404 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
41405 ; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41406 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
41407 ; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41408 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41409 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41410 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41411 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41412 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41413 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41414 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41415 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41416 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
41417 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41418 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41419 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41420 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
41421 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41422 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
41423 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
41424 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
41425 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41426 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
41427 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41428 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
41429 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
41430 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41431 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41432 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41433 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41434 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41435 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41436 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41437 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41438 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
41439 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41441 ; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
41442 ; GFX11TRUE16: ; %bb.0:
41443 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41444 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41445 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41446 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41447 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41448 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41449 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41450 ; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
41451 ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
41452 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41453 ; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41454 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41455 ; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
41456 ; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v7, v6
41457 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41458 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
41459 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41460 ; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
41461 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41462 ; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
41463 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41464 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41465 ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41466 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41467 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41468 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41469 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41470 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41471 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41472 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41473 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41474 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41475 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41476 ; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
41477 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41478 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41479 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41480 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
41481 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41482 ; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41483 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
41484 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41485 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
41486 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41487 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
41488 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41489 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
41490 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41491 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41492 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41493 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41494 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41495 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41496 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41497 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41498 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41499 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41500 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
41501 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
41502 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
41504 ; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
41505 ; GFX11FAKE16: ; %bb.0:
41506 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41507 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41508 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41509 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41510 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41511 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41512 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41513 ; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
41514 ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
41515 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41516 ; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41517 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41518 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
41519 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6
41520 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41521 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
41522 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41523 ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
41524 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41525 ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
41526 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41527 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41528 ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41529 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41530 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41531 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41532 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41533 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41534 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41535 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41536 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41537 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41538 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41539 ; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5
41540 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41541 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41542 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41543 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
41544 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41545 ; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41546 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
41547 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41548 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
41549 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41550 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
41551 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41552 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
41553 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41554 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41555 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41556 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41557 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41558 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41559 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41560 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41561 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41562 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41563 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
41564 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
41565 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
41566 %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
41567 ret <3 x bfloat> %op
; v_fmuladd_v4bf16: lowering of @llvm.fmuladd.v4bf16 on <4 x bfloat> operands.
; In every expected sequence below the multiply and the add stay separate f32
; instructions (v_mul_f32 followed by v_add_f32); no fused instruction appears.
;  - In the GCN and GFX7 sequences each product is masked with 0xffff0000
;    before the add, and each sum is masked the same way before returning.
;  - In the GFX8, GFX9, GFX10 and GFX11 sequences each product and each sum
;    goes through a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 pattern (presumably the f32-to-bf16 rounding with NaN
;    handling; confirm against the AMDGPU lowering), and the halves are then
;    packed with v_alignbit_b32 on GFX8 or v_perm_b32 on GFX9 and later.
; Both gfx1100 llc invocations from the file header share the plain GFX11
; prefix for this function.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
41570 define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
41571 ; GCN-LABEL: v_fmuladd_v4bf16:
41573 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41574 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
41575 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
41576 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
41577 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
41578 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
41579 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
41580 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
41581 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
41582 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
41583 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
41584 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
41585 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
41586 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41587 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41588 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
41589 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41590 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41591 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
41592 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41593 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41594 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
41595 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41596 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41597 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
41598 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
41599 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
41600 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
41601 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
41602 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41603 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41604 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41605 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41606 ; GCN-NEXT: v_add_f32_e32 v3, v3, v11
41607 ; GCN-NEXT: v_add_f32_e32 v2, v2, v10
41608 ; GCN-NEXT: v_add_f32_e32 v1, v1, v9
41609 ; GCN-NEXT: v_add_f32_e32 v0, v0, v8
41610 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41611 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41612 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41613 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41614 ; GCN-NEXT: s_setpc_b64 s[30:31]
41616 ; GFX7-LABEL: v_fmuladd_v4bf16:
41618 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41619 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
41620 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41621 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41622 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41623 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
41624 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
41625 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41626 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
41627 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41628 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41629 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41630 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41631 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41632 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41633 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41634 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41635 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
41636 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
41637 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
41638 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
41639 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
41640 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
41641 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
41642 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
41643 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41644 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
41645 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41646 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
41647 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41648 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
41649 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41650 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
41651 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
41652 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
41653 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
41654 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
41655 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41656 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41657 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41658 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41659 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41661 ; GFX8-LABEL: v_fmuladd_v4bf16:
41663 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41664 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41665 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41666 ; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
41667 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
41668 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
41669 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
41670 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
41671 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41672 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41673 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41674 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
41675 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
41676 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
41677 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41678 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
41679 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41680 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41681 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
41682 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
41683 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
41684 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41685 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41686 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41687 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41688 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41689 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
41690 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41691 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
41692 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41693 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41694 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
41695 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41696 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41697 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41698 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
41699 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41700 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41701 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41702 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41703 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
41704 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41705 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41706 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41707 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
41708 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41709 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41710 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41711 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41712 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
41713 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41714 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41715 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41716 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41717 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41718 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
41719 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
41720 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41721 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41722 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41723 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41724 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
41725 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
41726 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41727 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41728 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41729 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41730 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
41731 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41732 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41733 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
41734 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41735 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41736 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41737 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
41738 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41739 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41740 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
41741 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41743 ; GFX9-LABEL: v_fmuladd_v4bf16:
41745 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41746 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41747 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41748 ; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
41749 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
41750 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41751 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
41752 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
41753 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41754 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41755 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41756 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
41757 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
41758 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41759 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41760 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
41761 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
41762 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
41763 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
41764 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41765 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41766 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41767 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41768 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
41769 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41770 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
41771 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41772 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41773 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
41774 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41775 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41776 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
41777 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41778 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41779 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41780 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41781 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
41782 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41783 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41784 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
41785 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41786 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41787 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41788 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41789 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
41790 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41791 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41792 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41793 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
41794 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41795 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
41796 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41797 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41798 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41799 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41800 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
41801 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41802 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41803 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41804 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41805 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
41806 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41807 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41808 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41809 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41810 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41811 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41812 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41813 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
41814 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41816 ; GFX10-LABEL: v_fmuladd_v4bf16:
41818 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41819 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41820 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41821 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41822 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41823 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
41824 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41825 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
41826 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
41827 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41828 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
41829 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
41830 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
41831 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
41832 ; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
41833 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
41834 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41835 ; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
41836 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
41837 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41838 ; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
41839 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41840 ; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
41841 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
41842 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41843 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
41844 ; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
41845 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
41846 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41847 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41848 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
41849 ; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
41850 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41851 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
41852 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41853 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
41854 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41855 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
41856 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41857 ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
41858 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41859 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41860 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
41861 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
41862 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41863 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
41864 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41865 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
41866 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
41867 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
41868 ; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
41869 ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
41870 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
41871 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41872 ; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
41873 ; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
41874 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
41875 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41876 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
41877 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
41878 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41879 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41880 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41881 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41882 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41883 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41884 ; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
41885 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41887 ; GFX11-LABEL: v_fmuladd_v4bf16:
41889 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41890 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
41891 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41892 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41893 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41894 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
41895 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41896 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41897 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41898 ; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
41899 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
41900 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
41901 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41902 ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
41903 ; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7
41904 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6
41905 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
41906 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41907 ; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
41908 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
41909 ; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
41910 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
41911 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
41912 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41913 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7
41914 ; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
41915 ; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1
41916 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41917 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
41918 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41919 ; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
41920 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
41921 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
41922 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41923 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41924 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
41925 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41926 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
41927 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
41929 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
41930 ; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
41931 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41932 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41933 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41934 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
41935 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
41936 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
41937 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v8
41938 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
41939 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41940 ; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
41941 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
41942 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41943 ; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
41944 ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
41945 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41946 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41947 ; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
41948 ; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
41949 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
41950 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41951 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
41952 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
41953 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41954 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41955 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41956 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41957 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41958 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41959 ; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41960 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41961 ; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
41962 ; GFX11-NEXT: s_setpc_b64 s[30:31]
41963 %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
41964 ret <4 x bfloat> %op