1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
4 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
5 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
6 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
8 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
; test_load_store: load a single bfloat through %in and store it through %out
; (both addrspace(1) pointers).
; The assertions below expect one 16-bit load followed by one 16-bit store on
; every run line: buffer_* with addr64 for the default-CPU and hawaii runs,
; flat_* for tonga, and global_* for gfx900, gfx1010 and gfx1100.
; The trailing wait after the store differs per target (vmcnt+expcnt for the
; default CPU, vmcnt only for hawaii/tonga/gfx900, none for gfx1010/gfx1100).
10 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
11 ; GCN-LABEL: test_load_store:
13 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GCN-NEXT: s_mov_b32 s6, 0
15 ; GCN-NEXT: s_mov_b32 s7, 0xf000
16 ; GCN-NEXT: s_mov_b32 s4, s6
17 ; GCN-NEXT: s_mov_b32 s5, s6
18 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
19 ; GCN-NEXT: s_waitcnt vmcnt(0)
20 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
21 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
22 ; GCN-NEXT: s_setpc_b64 s[30:31]
24 ; GFX7-LABEL: test_load_store:
26 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX7-NEXT: s_mov_b32 s6, 0
28 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
29 ; GFX7-NEXT: s_mov_b32 s4, s6
30 ; GFX7-NEXT: s_mov_b32 s5, s6
31 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
32 ; GFX7-NEXT: s_waitcnt vmcnt(0)
33 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
34 ; GFX7-NEXT: s_waitcnt vmcnt(0)
35 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37 ; GFX8-LABEL: test_load_store:
39 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
41 ; GFX8-NEXT: s_waitcnt vmcnt(0)
42 ; GFX8-NEXT: flat_store_short v[2:3], v0
43 ; GFX8-NEXT: s_waitcnt vmcnt(0)
44 ; GFX8-NEXT: s_setpc_b64 s[30:31]
46 ; GFX9-LABEL: test_load_store:
48 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
50 ; GFX9-NEXT: s_waitcnt vmcnt(0)
51 ; GFX9-NEXT: global_store_short v[2:3], v0, off
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
53 ; GFX9-NEXT: s_setpc_b64 s[30:31]
55 ; GFX10-LABEL: test_load_store:
57 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: global_store_short v[2:3], v0, off
61 ; GFX10-NEXT: s_setpc_b64 s[30:31]
63 ; GFX11-LABEL: test_load_store:
65 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
67 ; GFX11-NEXT: s_waitcnt vmcnt(0)
68 ; GFX11-NEXT: global_store_b16 v[2:3], v0, off
69 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: scalar bfloat copy from %in to %out.
70 %val = load bfloat, ptr addrspace(1) %in
71 store bfloat %val, ptr addrspace(1) %out
; v_load_global_v2bf16: load a <2 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects a single 32-bit load. In the default-CPU and hawaii
; output the loaded dword is then unpacked into two registers: v0 receives the
; low 16 bits shifted left by 16, and v1 keeps only the high 16 bits (mask
; 0xffff0000). The tonga, gfx900, gfx1010 and gfx1100 output returns the
; loaded dword in v0 with no further instructions.
75 define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
76 ; GCN-LABEL: v_load_global_v2bf16:
78 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79 ; GCN-NEXT: s_mov_b32 s6, 0
80 ; GCN-NEXT: s_mov_b32 s7, 0xf000
81 ; GCN-NEXT: s_mov_b32 s4, s6
82 ; GCN-NEXT: s_mov_b32 s5, s6
83 ; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
84 ; GCN-NEXT: s_waitcnt vmcnt(0)
85 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
86 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
87 ; GCN-NEXT: s_setpc_b64 s[30:31]
89 ; GFX7-LABEL: v_load_global_v2bf16:
91 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX7-NEXT: s_mov_b32 s6, 0
93 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
94 ; GFX7-NEXT: s_mov_b32 s4, s6
95 ; GFX7-NEXT: s_mov_b32 s5, s6
96 ; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
97 ; GFX7-NEXT: s_waitcnt vmcnt(0)
98 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
99 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
100 ; GFX7-NEXT: s_setpc_b64 s[30:31]
102 ; GFX8-LABEL: v_load_global_v2bf16:
104 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
106 ; GFX8-NEXT: s_waitcnt vmcnt(0)
107 ; GFX8-NEXT: s_setpc_b64 s[30:31]
109 ; GFX9-LABEL: v_load_global_v2bf16:
111 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
113 ; GFX9-NEXT: s_waitcnt vmcnt(0)
114 ; GFX9-NEXT: s_setpc_b64 s[30:31]
116 ; GFX10-LABEL: v_load_global_v2bf16:
118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-NEXT: s_setpc_b64 s[30:31]
123 ; GFX11-LABEL: v_load_global_v2bf16:
125 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
127 ; GFX11-NEXT: s_waitcnt vmcnt(0)
128 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
129 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
130 ret <2 x bfloat> %load
; v_load_global_v3bf16: load a <3 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects a single 64-bit load (dwordx2 / b64). The default-CPU
; and hawaii output unpacks three values into v0, v1 and v2; only the low
; half of the second loaded dword is extracted. The tonga and newer output
; returns the loaded pair v[0:1] unchanged.
133 define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
134 ; GCN-LABEL: v_load_global_v3bf16:
136 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GCN-NEXT: s_mov_b32 s6, 0
138 ; GCN-NEXT: s_mov_b32 s7, 0xf000
139 ; GCN-NEXT: s_mov_b32 s4, s6
140 ; GCN-NEXT: s_mov_b32 s5, s6
141 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
142 ; GCN-NEXT: s_waitcnt vmcnt(0)
143 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
144 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
145 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
146 ; GCN-NEXT: s_setpc_b64 s[30:31]
148 ; GFX7-LABEL: v_load_global_v3bf16:
150 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151 ; GFX7-NEXT: s_mov_b32 s6, 0
152 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
153 ; GFX7-NEXT: s_mov_b32 s4, s6
154 ; GFX7-NEXT: s_mov_b32 s5, s6
155 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
156 ; GFX7-NEXT: s_waitcnt vmcnt(0)
157 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
158 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
159 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
160 ; GFX7-NEXT: s_setpc_b64 s[30:31]
162 ; GFX8-LABEL: v_load_global_v3bf16:
164 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
166 ; GFX8-NEXT: s_waitcnt vmcnt(0)
167 ; GFX8-NEXT: s_setpc_b64 s[30:31]
169 ; GFX9-LABEL: v_load_global_v3bf16:
171 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
173 ; GFX9-NEXT: s_waitcnt vmcnt(0)
174 ; GFX9-NEXT: s_setpc_b64 s[30:31]
176 ; GFX10-LABEL: v_load_global_v3bf16:
178 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
180 ; GFX10-NEXT: s_waitcnt vmcnt(0)
181 ; GFX10-NEXT: s_setpc_b64 s[30:31]
183 ; GFX11-LABEL: v_load_global_v3bf16:
185 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
187 ; GFX11-NEXT: s_waitcnt vmcnt(0)
188 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
189 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
190 ret <3 x bfloat> %load
; v_load_global_v4bf16: load a <4 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects a single 64-bit load (dwordx2 / b64). The default-CPU
; and hawaii output unpacks the two loaded dwords into v0..v3 using a
; shift-left-by-16 for each low half and a 0xffff0000 mask for each high half.
; The tonga and newer output returns the loaded pair v[0:1] unchanged.
193 define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
194 ; GCN-LABEL: v_load_global_v4bf16:
196 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; GCN-NEXT: s_mov_b32 s6, 0
198 ; GCN-NEXT: s_mov_b32 s7, 0xf000
199 ; GCN-NEXT: s_mov_b32 s4, s6
200 ; GCN-NEXT: s_mov_b32 s5, s6
201 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
202 ; GCN-NEXT: s_waitcnt vmcnt(0)
203 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
204 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
205 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
206 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
207 ; GCN-NEXT: s_setpc_b64 s[30:31]
209 ; GFX7-LABEL: v_load_global_v4bf16:
211 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX7-NEXT: s_mov_b32 s6, 0
213 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
214 ; GFX7-NEXT: s_mov_b32 s4, s6
215 ; GFX7-NEXT: s_mov_b32 s5, s6
216 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
217 ; GFX7-NEXT: s_waitcnt vmcnt(0)
218 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
219 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
220 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
221 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
222 ; GFX7-NEXT: s_setpc_b64 s[30:31]
224 ; GFX8-LABEL: v_load_global_v4bf16:
226 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
228 ; GFX8-NEXT: s_waitcnt vmcnt(0)
229 ; GFX8-NEXT: s_setpc_b64 s[30:31]
231 ; GFX9-LABEL: v_load_global_v4bf16:
233 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
235 ; GFX9-NEXT: s_waitcnt vmcnt(0)
236 ; GFX9-NEXT: s_setpc_b64 s[30:31]
238 ; GFX10-LABEL: v_load_global_v4bf16:
240 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
242 ; GFX10-NEXT: s_waitcnt vmcnt(0)
243 ; GFX10-NEXT: s_setpc_b64 s[30:31]
245 ; GFX11-LABEL: v_load_global_v4bf16:
247 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
249 ; GFX11-NEXT: s_waitcnt vmcnt(0)
250 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
251 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
252 ret <4 x bfloat> %load
; v_load_global_v6bf16: load a <6 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; The expected load width differs per run line: the default-CPU output uses a
; four-dword load (dwordx4 into v[3:6]) and then reads only v3..v5, while the
; hawaii, tonga, gfx900 and gfx1010 output uses a three-dword load (dwordx3)
; and gfx1100 uses b96.
; The default-CPU and hawaii output unpacks the three dwords into v0..v5
; (shift-left-by-16 for low halves, 0xffff0000 mask for high halves); the
; other targets return v[0:2] as loaded.
255 define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
256 ; GCN-LABEL: v_load_global_v6bf16:
258 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259 ; GCN-NEXT: s_mov_b32 s6, 0
260 ; GCN-NEXT: s_mov_b32 s7, 0xf000
261 ; GCN-NEXT: s_mov_b32 s4, s6
262 ; GCN-NEXT: s_mov_b32 s5, s6
263 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
264 ; GCN-NEXT: s_waitcnt vmcnt(0)
265 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
266 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
267 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
268 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
269 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
270 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
271 ; GCN-NEXT: s_setpc_b64 s[30:31]
273 ; GFX7-LABEL: v_load_global_v6bf16:
275 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX7-NEXT: s_mov_b32 s6, 0
277 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
278 ; GFX7-NEXT: s_mov_b32 s4, s6
279 ; GFX7-NEXT: s_mov_b32 s5, s6
280 ; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
281 ; GFX7-NEXT: s_waitcnt vmcnt(0)
282 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
283 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
284 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
285 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
286 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
287 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
288 ; GFX7-NEXT: s_setpc_b64 s[30:31]
290 ; GFX8-LABEL: v_load_global_v6bf16:
292 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
294 ; GFX8-NEXT: s_waitcnt vmcnt(0)
295 ; GFX8-NEXT: s_setpc_b64 s[30:31]
297 ; GFX9-LABEL: v_load_global_v6bf16:
299 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
301 ; GFX9-NEXT: s_waitcnt vmcnt(0)
302 ; GFX9-NEXT: s_setpc_b64 s[30:31]
304 ; GFX10-LABEL: v_load_global_v6bf16:
306 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
308 ; GFX10-NEXT: s_waitcnt vmcnt(0)
309 ; GFX10-NEXT: s_setpc_b64 s[30:31]
311 ; GFX11-LABEL: v_load_global_v6bf16:
313 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
315 ; GFX11-NEXT: s_waitcnt vmcnt(0)
316 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
317 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
318 ret <6 x bfloat> %load
; v_load_global_v8bf16: load an <8 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects a single 128-bit load (dwordx4 / b128). The
; default-CPU and hawaii output loads into v[4:7] and unpacks the four dwords
; into v0..v7 (shift-left-by-16 for low halves, 0xffff0000 mask for high
; halves). The tonga and newer output returns v[0:3] as loaded.
321 define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
322 ; GCN-LABEL: v_load_global_v8bf16:
324 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; GCN-NEXT: s_mov_b32 s6, 0
326 ; GCN-NEXT: s_mov_b32 s7, 0xf000
327 ; GCN-NEXT: s_mov_b32 s4, s6
328 ; GCN-NEXT: s_mov_b32 s5, s6
329 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
330 ; GCN-NEXT: s_waitcnt vmcnt(0)
331 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
332 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
333 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
334 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
335 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
336 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
337 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
338 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
339 ; GCN-NEXT: s_setpc_b64 s[30:31]
341 ; GFX7-LABEL: v_load_global_v8bf16:
343 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX7-NEXT: s_mov_b32 s6, 0
345 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
346 ; GFX7-NEXT: s_mov_b32 s4, s6
347 ; GFX7-NEXT: s_mov_b32 s5, s6
348 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
349 ; GFX7-NEXT: s_waitcnt vmcnt(0)
350 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
351 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
352 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
353 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
354 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
355 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
356 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
357 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
358 ; GFX7-NEXT: s_setpc_b64 s[30:31]
360 ; GFX8-LABEL: v_load_global_v8bf16:
362 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
364 ; GFX8-NEXT: s_waitcnt vmcnt(0)
365 ; GFX8-NEXT: s_setpc_b64 s[30:31]
367 ; GFX9-LABEL: v_load_global_v8bf16:
369 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
372 ; GFX9-NEXT: s_setpc_b64 s[30:31]
374 ; GFX10-LABEL: v_load_global_v8bf16:
376 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
378 ; GFX10-NEXT: s_waitcnt vmcnt(0)
379 ; GFX10-NEXT: s_setpc_b64 s[30:31]
381 ; GFX11-LABEL: v_load_global_v8bf16:
383 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
385 ; GFX11-NEXT: s_waitcnt vmcnt(0)
386 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
387 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
388 ret <8 x bfloat> %load
; v_load_global_v16bf16: load a <16 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects two 128-bit loads, at byte offsets 0 and 16.
; - default CPU and hawaii use buffer loads with an immediate offset, wait on
;   each load separately (vmcnt(1), then vmcnt(0)) and unpack the eight dwords
;   into v0..v15.
; - tonga has no offset on the flat load, so the second address is formed
;   with a 64-bit add (v_add_u32 / v_addc_u32) first.
; - gfx900, gfx1010 and gfx1100 first copy the pointer out of v[0:1] and use
;   "off offset:16" for the second load; gfx1010 and gfx1100 also wrap the
;   two loads in s_clause 0x1.
391 define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
392 ; GCN-LABEL: v_load_global_v16bf16:
394 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395 ; GCN-NEXT: s_mov_b32 s6, 0
396 ; GCN-NEXT: s_mov_b32 s7, 0xf000
397 ; GCN-NEXT: s_mov_b32 s4, s6
398 ; GCN-NEXT: s_mov_b32 s5, s6
399 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
400 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
401 ; GCN-NEXT: s_waitcnt vmcnt(1)
402 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
403 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
404 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
405 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
406 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
407 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
408 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
409 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
410 ; GCN-NEXT: s_waitcnt vmcnt(0)
411 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
412 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
413 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
414 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
415 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
416 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
417 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
418 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
419 ; GCN-NEXT: s_setpc_b64 s[30:31]
421 ; GFX7-LABEL: v_load_global_v16bf16:
423 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GFX7-NEXT: s_mov_b32 s6, 0
425 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
426 ; GFX7-NEXT: s_mov_b32 s4, s6
427 ; GFX7-NEXT: s_mov_b32 s5, s6
428 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
429 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
430 ; GFX7-NEXT: s_waitcnt vmcnt(1)
431 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
432 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
433 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
434 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
435 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
436 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
437 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
438 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
439 ; GFX7-NEXT: s_waitcnt vmcnt(0)
440 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
441 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
442 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
443 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
444 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
445 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
446 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
447 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
448 ; GFX7-NEXT: s_setpc_b64 s[30:31]
450 ; GFX8-LABEL: v_load_global_v16bf16:
452 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
454 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
455 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
456 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
457 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
458 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
459 ; GFX8-NEXT: s_waitcnt vmcnt(0)
460 ; GFX8-NEXT: s_setpc_b64 s[30:31]
462 ; GFX9-LABEL: v_load_global_v16bf16:
464 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465 ; GFX9-NEXT: v_mov_b32_e32 v9, v1
466 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
467 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
468 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: s_setpc_b64 s[30:31]
472 ; GFX10-LABEL: v_load_global_v16bf16:
474 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475 ; GFX10-NEXT: v_mov_b32_e32 v9, v1
476 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
477 ; GFX10-NEXT: s_clause 0x1
478 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
479 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
480 ; GFX10-NEXT: s_waitcnt vmcnt(0)
481 ; GFX10-NEXT: s_setpc_b64 s[30:31]
483 ; GFX11-LABEL: v_load_global_v16bf16:
485 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
487 ; GFX11-NEXT: s_clause 0x1
488 ; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off
489 ; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
490 ; GFX11-NEXT: s_waitcnt vmcnt(0)
491 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
492 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
493 ret <16 x bfloat> %load
; v_load_global_v32bf16: load a <32 x bfloat> through the addrspace(1) pointer
; %ptr and return it.
; Every run line expects four 128-bit loads, at byte offsets 0, 16, 32 and 48.
; - default CPU and hawaii issue all four buffer loads up front, then wait
;   with a decreasing counter (vmcnt(3) .. vmcnt(0)) and unpack each group of
;   four dwords as it arrives, filling v0..v31.
; - tonga computes the three non-zero-offset addresses with 64-bit adds
;   (v_add_u32 / v_addc_u32) because its flat loads carry no offset.
; - gfx900, gfx1010 and gfx1100 copy the pointer out of v[0:1] and use
;   immediate offsets; gfx1010 and gfx1100 wrap the four loads in
;   s_clause 0x3. The result is returned in v[0:15] as loaded.
496 define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
497 ; GCN-LABEL: v_load_global_v32bf16:
499 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500 ; GCN-NEXT: s_mov_b32 s6, 0
501 ; GCN-NEXT: s_mov_b32 s7, 0xf000
502 ; GCN-NEXT: s_mov_b32 s4, s6
503 ; GCN-NEXT: s_mov_b32 s5, s6
504 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
505 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
506 ; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
507 ; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
508 ; GCN-NEXT: s_waitcnt vmcnt(3)
509 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
510 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
511 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
512 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
513 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
514 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
515 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
516 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
517 ; GCN-NEXT: s_waitcnt vmcnt(2)
518 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
519 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
520 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
521 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
522 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
523 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
524 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
525 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
526 ; GCN-NEXT: s_waitcnt vmcnt(1)
527 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
528 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
529 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
530 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
531 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
532 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
533 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
534 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
535 ; GCN-NEXT: s_waitcnt vmcnt(0)
536 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
537 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
538 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
539 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
540 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
541 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
542 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
543 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
544 ; GCN-NEXT: s_setpc_b64 s[30:31]
546 ; GFX7-LABEL: v_load_global_v32bf16:
548 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549 ; GFX7-NEXT: s_mov_b32 s6, 0
550 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
551 ; GFX7-NEXT: s_mov_b32 s4, s6
552 ; GFX7-NEXT: s_mov_b32 s5, s6
553 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
554 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
555 ; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
556 ; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
557 ; GFX7-NEXT: s_waitcnt vmcnt(3)
558 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
559 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
560 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
561 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
562 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
563 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
564 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
565 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
566 ; GFX7-NEXT: s_waitcnt vmcnt(2)
567 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
568 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
569 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
570 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
571 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
572 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
573 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
574 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
575 ; GFX7-NEXT: s_waitcnt vmcnt(1)
576 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
577 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
578 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
579 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
580 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
581 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
582 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
583 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
584 ; GFX7-NEXT: s_waitcnt vmcnt(0)
585 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
586 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
587 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
588 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
589 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
590 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
591 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
592 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
593 ; GFX7-NEXT: s_setpc_b64 s[30:31]
595 ; GFX8-LABEL: v_load_global_v32bf16:
597 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GFX8-NEXT: v_mov_b32_e32 v12, v0
599 ; GFX8-NEXT: v_mov_b32_e32 v13, v1
600 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12
601 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc
602 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12
603 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
604 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13]
605 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12
606 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
607 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
608 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
609 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
610 ; GFX8-NEXT: s_waitcnt vmcnt(0)
611 ; GFX8-NEXT: s_setpc_b64 s[30:31]
613 ; GFX9-LABEL: v_load_global_v32bf16:
615 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616 ; GFX9-NEXT: v_mov_b32_e32 v17, v1
617 ; GFX9-NEXT: v_mov_b32_e32 v16, v0
618 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
619 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
620 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
621 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
622 ; GFX9-NEXT: s_waitcnt vmcnt(0)
623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
625 ; GFX10-LABEL: v_load_global_v32bf16:
627 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GFX10-NEXT: v_mov_b32_e32 v17, v1
629 ; GFX10-NEXT: v_mov_b32_e32 v16, v0
630 ; GFX10-NEXT: s_clause 0x3
631 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
632 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
633 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
634 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
635 ; GFX10-NEXT: s_waitcnt vmcnt(0)
636 ; GFX10-NEXT: s_setpc_b64 s[30:31]
638 ; GFX11-LABEL: v_load_global_v32bf16:
640 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641 ; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
642 ; GFX11-NEXT: s_clause 0x3
643 ; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off
644 ; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
645 ; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
646 ; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
647 ; GFX11-NEXT: s_waitcnt vmcnt(0)
648 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: vector load returned directly.
649 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
650 ret <32 x bfloat> %load
653 define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
654 ; GCN-LABEL: v_load_global_v64bf16:
656 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657 ; GCN-NEXT: s_mov_b32 s7, 0xf000
658 ; GCN-NEXT: s_mov_b32 s6, 0
659 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
660 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
661 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
662 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
663 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
664 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
665 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
666 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
667 ; GCN-NEXT: s_mov_b32 s4, s6
668 ; GCN-NEXT: s_mov_b32 s5, s6
669 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
670 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
671 ; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
672 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
673 ; GCN-NEXT: s_waitcnt vmcnt(0)
674 ; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
675 ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
676 ; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
677 ; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
678 ; GCN-NEXT: s_waitcnt expcnt(0)
679 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
680 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
681 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
682 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
683 ; GCN-NEXT: s_waitcnt vmcnt(0)
684 ; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
685 ; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
686 ; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
687 ; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
688 ; GCN-NEXT: s_waitcnt expcnt(0)
689 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
690 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
691 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
692 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
693 ; GCN-NEXT: s_waitcnt vmcnt(0)
694 ; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
695 ; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
696 ; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
697 ; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
698 ; GCN-NEXT: s_waitcnt expcnt(0)
699 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
700 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
701 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
702 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
703 ; GCN-NEXT: s_waitcnt vmcnt(0)
704 ; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
705 ; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
706 ; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
707 ; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
708 ; GCN-NEXT: s_waitcnt expcnt(0)
709 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
710 ; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
711 ; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
712 ; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
713 ; GCN-NEXT: s_waitcnt vmcnt(3)
714 ; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
715 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
716 ; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
717 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
718 ; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
719 ; GCN-NEXT: s_waitcnt expcnt(0)
720 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
721 ; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
722 ; GCN-NEXT: s_waitcnt expcnt(0)
723 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
724 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
725 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
726 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
727 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
728 ; GCN-NEXT: s_waitcnt vmcnt(6)
729 ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
730 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
731 ; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
732 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
733 ; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
734 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
735 ; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
736 ; GCN-NEXT: s_waitcnt vmcnt(8)
737 ; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
738 ; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
739 ; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
740 ; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
741 ; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
742 ; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
743 ; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
744 ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
745 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
746 ; GCN-NEXT: s_setpc_b64 s[30:31]
748 ; GFX7-LABEL: v_load_global_v64bf16:
750 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751 ; GFX7-NEXT: s_mov_b32 s6, 0
752 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
753 ; GFX7-NEXT: s_mov_b32 s4, s6
754 ; GFX7-NEXT: s_mov_b32 s5, s6
755 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
756 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
757 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
758 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
759 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
760 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
761 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
762 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
763 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
764 ; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
765 ; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
766 ; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
767 ; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
768 ; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
769 ; GFX7-NEXT: s_waitcnt vmcnt(0)
770 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
771 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
772 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
773 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
774 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
775 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0
776 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0
777 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0
778 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0
779 ; GFX7-NEXT: s_waitcnt vmcnt(0)
780 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
781 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
782 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
783 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
784 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
785 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
786 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
787 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0
788 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0
789 ; GFX7-NEXT: s_waitcnt vmcnt(0)
790 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
791 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
792 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
793 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
794 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
795 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
796 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
797 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
798 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0
799 ; GFX7-NEXT: s_waitcnt vmcnt(0)
800 ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
801 ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
802 ; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
803 ; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
804 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
805 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
806 ; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
807 ; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
808 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0
809 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
810 ; GFX7-NEXT: s_waitcnt vmcnt(3)
811 ; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
812 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
813 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
814 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
815 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
816 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
817 ; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
818 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
819 ; GFX7-NEXT: s_waitcnt vmcnt(6)
820 ; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
821 ; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
822 ; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
823 ; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
824 ; GFX7-NEXT: s_waitcnt vmcnt(9)
825 ; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
826 ; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
827 ; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
828 ; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
829 ; GFX7-NEXT: s_waitcnt vmcnt(12)
830 ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
831 ; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
832 ; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
833 ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
834 ; GFX7-NEXT: s_waitcnt vmcnt(0)
835 ; GFX7-NEXT: s_setpc_b64 s[30:31]
837 ; GFX8-LABEL: v_load_global_v64bf16:
839 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
840 ; GFX8-NEXT: v_mov_b32_e32 v28, v0
841 ; GFX8-NEXT: v_mov_b32_e32 v29, v1
842 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28
843 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc
844 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28
845 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc
846 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28
847 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc
848 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28
849 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc
850 ; GFX8-NEXT: s_movk_i32 s4, 0x50
851 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28
852 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc
853 ; GFX8-NEXT: s_movk_i32 s4, 0x60
854 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28
855 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
856 ; GFX8-NEXT: s_movk_i32 s4, 0x70
857 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
858 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
859 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
860 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
861 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
862 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
863 ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
864 ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
865 ; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
866 ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
867 ; GFX8-NEXT: s_waitcnt vmcnt(0)
868 ; GFX8-NEXT: s_setpc_b64 s[30:31]
870 ; GFX9-LABEL: v_load_global_v64bf16:
872 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873 ; GFX9-NEXT: v_mov_b32_e32 v29, v1
874 ; GFX9-NEXT: v_mov_b32_e32 v28, v0
875 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
876 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
877 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
878 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
879 ; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
880 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
881 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
883 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
884 ; GFX9-NEXT: s_waitcnt vmcnt(0)
885 ; GFX9-NEXT: s_setpc_b64 s[30:31]
887 ; GFX10-LABEL: v_load_global_v64bf16:
889 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890 ; GFX10-NEXT: v_mov_b32_e32 v33, v1
891 ; GFX10-NEXT: v_mov_b32_e32 v32, v0
892 ; GFX10-NEXT: s_clause 0x7
893 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off
894 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16
895 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32
896 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48
897 ; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64
898 ; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80
899 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96
900 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112
901 ; GFX10-NEXT: s_waitcnt vmcnt(0)
902 ; GFX10-NEXT: s_setpc_b64 s[30:31]
904 ; GFX11-LABEL: v_load_global_v64bf16:
906 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907 ; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
908 ; GFX11-NEXT: s_clause 0x7
909 ; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off
910 ; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
911 ; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
912 ; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
913 ; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
914 ; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
915 ; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
916 ; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
917 ; GFX11-NEXT: s_waitcnt vmcnt(0)
918 ; GFX11-NEXT: s_setpc_b64 s[30:31]
919 %load = load <64 x bfloat>, ptr addrspace(1) %ptr
920 ret <64 x bfloat> %load
; v_store_global_v2bf16: stores the <2 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; The assertions below pin the llc output for each RUN configuration. As
; recorded here, the GCN and GFX7 sequences multiply both elements by 1.0
; (v_mul_f32), shift v1 right by 16 and merge it with v0 via v_alignbit_b32
; before one dword buffer store; the GFX8 and later sequences store v0 as-is.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
923 define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
924 ; GCN-LABEL: v_store_global_v2bf16:
926 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
928 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
929 ; GCN-NEXT: s_mov_b32 s6, 0
930 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
931 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
932 ; GCN-NEXT: s_mov_b32 s7, 0xf000
933 ; GCN-NEXT: s_mov_b32 s4, s6
934 ; GCN-NEXT: s_mov_b32 s5, s6
935 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
936 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
937 ; GCN-NEXT: s_setpc_b64 s[30:31]
939 ; GFX7-LABEL: v_store_global_v2bf16:
941 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
942 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
943 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
944 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
945 ; GFX7-NEXT: s_mov_b32 s6, 0
946 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
947 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
948 ; GFX7-NEXT: s_mov_b32 s4, s6
949 ; GFX7-NEXT: s_mov_b32 s5, s6
950 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
951 ; GFX7-NEXT: s_waitcnt vmcnt(0)
952 ; GFX7-NEXT: s_setpc_b64 s[30:31]
954 ; GFX8-LABEL: v_store_global_v2bf16:
956 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
957 ; GFX8-NEXT: flat_store_dword v[1:2], v0
958 ; GFX8-NEXT: s_waitcnt vmcnt(0)
959 ; GFX8-NEXT: s_setpc_b64 s[30:31]
961 ; GFX9-LABEL: v_store_global_v2bf16:
963 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
964 ; GFX9-NEXT: global_store_dword v[1:2], v0, off
965 ; GFX9-NEXT: s_waitcnt vmcnt(0)
966 ; GFX9-NEXT: s_setpc_b64 s[30:31]
968 ; GFX10-LABEL: v_store_global_v2bf16:
970 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
971 ; GFX10-NEXT: global_store_dword v[1:2], v0, off
972 ; GFX10-NEXT: s_setpc_b64 s[30:31]
974 ; GFX11-LABEL: v_store_global_v2bf16:
976 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off
978 ; GFX11-NEXT: s_setpc_b64 s[30:31]
979 store <2 x bfloat> %val, ptr addrspace(1) %ptr
; v_store_global_v3bf16: stores the <3 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; In every recorded configuration the store is split in two: a dword store at
; the base address and a 16-bit store at byte offset 4. The GFX8 sequence forms
; the +4 address with v_add_u32/v_addc_u32; the others use an offset:4 operand.
; The GCN and GFX7 sequences also multiply each element by 1.0 and pack the
; first two with v_lshrrev_b32/v_alignbit_b32 before storing.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
983 define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
984 ; GCN-LABEL: v_store_global_v3bf16:
986 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
987 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
988 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
989 ; GCN-NEXT: s_mov_b32 s7, 0xf000
990 ; GCN-NEXT: s_mov_b32 s6, 0
991 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
992 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
993 ; GCN-NEXT: s_mov_b32 s4, s6
994 ; GCN-NEXT: s_mov_b32 s5, s6
995 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
996 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
997 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
998 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
999 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1000 ; GCN-NEXT: s_setpc_b64 s[30:31]
1002 ; GFX7-LABEL: v_store_global_v3bf16:
1004 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1005 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1006 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1007 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1008 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
1009 ; GFX7-NEXT: s_mov_b32 s6, 0
1010 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
1011 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1012 ; GFX7-NEXT: s_mov_b32 s4, s6
1013 ; GFX7-NEXT: s_mov_b32 s5, s6
1014 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1015 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
1016 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
1017 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1018 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1020 ; GFX8-LABEL: v_store_global_v3bf16:
1022 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX8-NEXT: flat_store_dword v[2:3], v0
1024 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
1025 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1026 ; GFX8-NEXT: flat_store_short v[2:3], v1
1027 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1028 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1030 ; GFX9-LABEL: v_store_global_v3bf16:
1032 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033 ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
1034 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
1035 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1036 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1038 ; GFX10-LABEL: v_store_global_v3bf16:
1040 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1041 ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
1042 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
1043 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1045 ; GFX11-LABEL: v_store_global_v3bf16:
1047 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1048 ; GFX11-NEXT: s_clause 0x1
1049 ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
1050 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
1051 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1052 store <3 x bfloat> %val, ptr addrspace(1) %ptr
; v_store_global_v4bf16: stores the <4 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; Every recorded configuration ends in one 64-bit store (dwordx2 / b64). The
; GCN and GFX7 sequences first multiply the four elements by 1.0 and pack them
; pairwise into two dwords with v_lshrrev_b32/v_alignbit_b32; the GFX8 and
; later sequences store v[0:1] as-is.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1056 define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
1057 ; GCN-LABEL: v_store_global_v4bf16:
1059 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1061 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
1062 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1063 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1064 ; GCN-NEXT: s_mov_b32 s6, 0
1065 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1066 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1067 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
1068 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
1069 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1070 ; GCN-NEXT: s_mov_b32 s4, s6
1071 ; GCN-NEXT: s_mov_b32 s5, s6
1072 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
1073 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1074 ; GCN-NEXT: s_setpc_b64 s[30:31]
1076 ; GFX7-LABEL: v_store_global_v4bf16:
1078 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1079 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1080 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1081 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1082 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1083 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1084 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1085 ; GFX7-NEXT: s_mov_b32 s6, 0
1086 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
1087 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
1088 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1089 ; GFX7-NEXT: s_mov_b32 s4, s6
1090 ; GFX7-NEXT: s_mov_b32 s5, s6
1091 ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
1092 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1095 ; GFX8-LABEL: v_store_global_v4bf16:
1097 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1098 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1099 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1100 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1102 ; GFX9-LABEL: v_store_global_v4bf16:
1104 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1105 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1109 ; GFX10-LABEL: v_store_global_v4bf16:
1111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1113 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1115 ; GFX11-LABEL: v_store_global_v4bf16:
1117 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1118 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
1119 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1120 store <4 x bfloat> %val, ptr addrspace(1) %ptr
; v_store_global_v8bf16: stores the <8 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; Every recorded configuration ends in one 128-bit store (dwordx4 / b128). The
; GCN and GFX7 sequences first multiply the eight elements by 1.0 and pack them
; pairwise into four dwords with v_lshrrev_b32/v_alignbit_b32; the GFX8 and
; later sequences store v[0:3] as-is.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1124 define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
1125 ; GCN-LABEL: v_store_global_v8bf16:
1127 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1129 ; GCN-NEXT: s_mov_b32 s6, 0
1130 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1131 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1132 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1133 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1134 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1135 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
1136 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1137 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1138 ; GCN-NEXT: s_mov_b32 s4, s6
1139 ; GCN-NEXT: s_mov_b32 s5, s6
1140 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
1141 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1142 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
1143 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
1144 ; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
1145 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
1146 ; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
1147 ; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
1148 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1149 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1150 ; GCN-NEXT: s_setpc_b64 s[30:31]
1152 ; GFX7-LABEL: v_store_global_v8bf16:
1154 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1155 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1156 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1157 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1158 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1159 ; GFX7-NEXT: s_mov_b32 s6, 0
1160 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1161 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1162 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1163 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1164 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1165 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1166 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1167 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1168 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1169 ; GFX7-NEXT: s_mov_b32 s4, s6
1170 ; GFX7-NEXT: s_mov_b32 s5, s6
1171 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
1172 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
1173 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
1174 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
1175 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
1176 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1177 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1179 ; GFX8-LABEL: v_store_global_v8bf16:
1181 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1182 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1183 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1184 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1186 ; GFX9-LABEL: v_store_global_v8bf16:
1188 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1189 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
1190 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1191 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1193 ; GFX10-LABEL: v_store_global_v8bf16:
1195 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
1197 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1199 ; GFX11-LABEL: v_store_global_v8bf16:
1201 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1202 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
1203 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1204 store <8 x bfloat> %val, ptr addrspace(1) %ptr
; v_store_global_v16bf16: stores the <16 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; Every recorded configuration emits two 128-bit stores, one at the base
; address and one at byte offset 16. The GFX8 sequence forms the +16 address
; with v_add_u32/v_addc_u32; the others use an offset:16 operand.
; The GCN and GFX7 sequences first multiply the sixteen elements by 1.0 and
; pack them pairwise into eight dwords with v_lshrrev_b32/v_alignbit_b32.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1208 define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
1209 ; GCN-LABEL: v_store_global_v16bf16:
1211 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1212 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1213 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1214 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1215 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1216 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1217 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
1218 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1219 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1220 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1221 ; GCN-NEXT: s_mov_b32 s6, 0
1222 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
1223 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1224 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1225 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1226 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
1227 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1228 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1229 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1230 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1231 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1232 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1233 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
1234 ; GCN-NEXT: s_mov_b32 s4, s6
1235 ; GCN-NEXT: s_mov_b32 s5, s6
1236 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
1237 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
1238 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
1239 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
1240 ; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
1241 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
1242 ; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
1243 ; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
1244 ; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
1245 ; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
1246 ; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
1247 ; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
1248 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
1249 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
1250 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1251 ; GCN-NEXT: s_setpc_b64 s[30:31]
1253 ; GFX7-LABEL: v_store_global_v16bf16:
1255 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1257 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1258 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1259 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1260 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1261 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1262 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1263 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1264 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1265 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
1266 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
1267 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
1268 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1269 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1270 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1271 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
1272 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
1273 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1274 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
1275 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1276 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
1277 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1278 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
1279 ; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
1280 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
1281 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1282 ; GFX7-NEXT: s_mov_b32 s6, 0
1283 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1284 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
1285 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1286 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1287 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1288 ; GFX7-NEXT: s_mov_b32 s4, s6
1289 ; GFX7-NEXT: s_mov_b32 s5, s6
1290 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
1291 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
1292 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
1293 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
1294 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1297 ; GFX8-LABEL: v_store_global_v16bf16:
1299 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1300 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
1301 ; GFX8-NEXT: s_nop 0
1302 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
1303 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
1304 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1305 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1308 ; GFX9-LABEL: v_store_global_v16bf16:
1310 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
1312 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
1313 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1314 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1316 ; GFX10-LABEL: v_store_global_v16bf16:
1318 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1319 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
1320 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
1321 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1323 ; GFX11-LABEL: v_store_global_v16bf16:
1325 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1326 ; GFX11-NEXT: s_clause 0x1
1327 ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
1328 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
1329 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1330 store <16 x bfloat> %val, ptr addrspace(1) %ptr
; v_store_global_v32bf16: stores the <32 x bfloat> argument %val to the global
; (addrspace(1)) pointer %ptr.
; Every recorded configuration emits four 128-bit stores at byte offsets 0, 16,
; 32 and 48. The GFX8 sequence forms each non-zero address with
; v_add_u32/v_addc_u32; the others use offset operands.
; The GCN and GFX7 sequences additionally reload three dwords from s32-relative
; stack slots (offsets 0, 4 and 8): the offset-0 value is packed as the last
; element and the other two become the store address v[0:1]. Presumably these
; are the arguments that did not fit in VGPRs -- confirm against the calling
; convention.
; These assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1334 define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
1335 ; GCN-LABEL: v_store_global_v32bf16:
1337 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1338 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1339 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1340 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1341 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1342 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1343 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
1344 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
1345 ; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
1346 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
1347 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
1348 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1349 ; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
1350 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
1351 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1352 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1353 ; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
1354 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
1355 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14
1356 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13
1357 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
1358 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1359 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1360 ; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16
1361 ; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16
1362 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
1363 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10
1364 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1365 ; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16
1366 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9
1367 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
1368 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1369 ; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16
1370 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
1371 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22
1372 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
1373 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20
1374 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1375 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1376 ; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16
1377 ; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16
1378 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
1379 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
1380 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1381 ; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16
1382 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
1383 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
1384 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1385 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29
1386 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28
1387 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27
1388 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
1389 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1390 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6
1391 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
1392 ; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16
1393 ; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16
1394 ; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16
1395 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
1396 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
1397 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24
1398 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1399 ; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16
1400 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
1401 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
1402 ; GCN-NEXT: s_mov_b32 s6, 0
1403 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30
1404 ; GCN-NEXT: s_mov_b32 s4, s6
1405 ; GCN-NEXT: s_mov_b32 s5, s6
1406 ; GCN-NEXT: s_waitcnt vmcnt(1)
1407 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
1408 ; GCN-NEXT: s_waitcnt vmcnt(0)
1409 ; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
1410 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
1411 ; GCN-NEXT: s_waitcnt expcnt(1)
1412 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17
1413 ; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16
1414 ; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
1415 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1416 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1417 ; GCN-NEXT: s_setpc_b64 s[30:31]
1419 ; GFX7-LABEL: v_store_global_v32bf16:
1421 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1423 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1424 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1425 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1426 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1427 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1428 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
1429 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
1430 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1431 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
1432 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1433 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1434 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1435 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1436 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1437 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1438 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
1439 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
1440 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
1441 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1442 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
1443 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1444 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
1445 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
1446 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
1447 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
1448 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1449 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
1450 ; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
1451 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
1452 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1453 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
1454 ; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
1455 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
1456 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1457 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
1458 ; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
1459 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
1460 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1461 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
1462 ; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
1463 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
1464 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1465 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
1466 ; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
1467 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
1468 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1469 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
1470 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16
1471 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
1472 ; GFX7-NEXT: s_mov_b32 s6, 0
1473 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
1474 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1475 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1476 ; GFX7-NEXT: s_mov_b32 s4, s6
1477 ; GFX7-NEXT: s_mov_b32 s5, s6
1478 ; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
1479 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1480 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
1481 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1482 ; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
1483 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
1484 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1485 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
1486 ; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
1487 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
1488 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1489 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
1490 ; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
1491 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
1492 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1493 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16
1494 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1495 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
1496 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
1497 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
1498 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
1499 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1500 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1502 ; GFX8-LABEL: v_store_global_v32bf16:
1504 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1505 ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1506 ; GFX8-NEXT: s_nop 0
1507 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
1508 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1509 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
1510 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
1511 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1512 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1513 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
1514 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
1515 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1516 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1517 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1519 ; GFX9-LABEL: v_store_global_v32bf16:
1521 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1522 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
1523 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
1524 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
1525 ; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
1526 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1527 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1529 ; GFX10-LABEL: v_store_global_v32bf16:
1531 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1532 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
1533 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
1534 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
1535 ; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
1536 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1538 ; GFX11-LABEL: v_store_global_v32bf16:
1540 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1541 ; GFX11-NEXT: s_clause 0x3
1542 ; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
1543 ; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
1544 ; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
1545 ; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
1546 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1547 store <32 x bfloat> %val, ptr addrspace(1) %ptr
; Store of a <64 x bfloat> function argument to a global (addrspace(1)) pointer.
; Every target's checks contain s32-relative loads, which fetch the destination
; pointer and the part of %val that is not already held in VGPRs.
; - On the GCN and GFX7 prefixes each element is expected to go through
;   v_mul_f32 1.0 / v_lshrrev_b32 16 / v_alignbit_b32, packing two values per
;   dword ahead of eight buffer_store_dwordx4 (presumably because each bfloat
;   occupies its own 32-bit slot on those targets; confirm against the calling
;   convention).
; - GFX8 is expected to store the incoming registers unchanged with eight
;   flat_store_dwordx4, forming the non-zero offsets with v_add_u32/v_addc_u32.
; - GFX9, GFX10 and GFX11 are expected to store the incoming registers unchanged
;   using immediate offsets 0..112 on global_store_dwordx4 / global_store_b128.
1551 define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
1552 ; GCN-LABEL: v_store_global_v64bf16:
1554 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1555 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
1556 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
1557 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
1558 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
1559 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
1560 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
1561 ; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
1562 ; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
1563 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
1564 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1565 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
1566 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
1567 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17
1568 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16
1569 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136
1570 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132
1571 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
1572 ; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
1573 ; GCN-NEXT: s_mov_b32 s6, 0
1574 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1575 ; GCN-NEXT: s_mov_b32 s4, s6
1576 ; GCN-NEXT: s_mov_b32 s5, s6
1577 ; GCN-NEXT: s_waitcnt vmcnt(0)
1578 ; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32
1579 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
1580 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
1581 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1582 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1583 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
1584 ; GCN-NEXT: s_waitcnt expcnt(0)
1585 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13
1586 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
1587 ; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16
1588 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
1589 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1590 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
1591 ; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
1592 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1593 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1594 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
1595 ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
1596 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
1597 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
1598 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124
1599 ; GCN-NEXT: s_waitcnt expcnt(0)
1600 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120
1601 ; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116
1602 ; GCN-NEXT: s_waitcnt vmcnt(3)
1603 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1604 ; GCN-NEXT: s_waitcnt vmcnt(2)
1605 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1606 ; GCN-NEXT: s_waitcnt vmcnt(1)
1607 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1608 ; GCN-NEXT: s_waitcnt vmcnt(0)
1609 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
1610 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1611 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
1612 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112
1613 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108
1614 ; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
1615 ; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
1616 ; GCN-NEXT: s_waitcnt vmcnt(1)
1617 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
1618 ; GCN-NEXT: s_waitcnt vmcnt(0)
1619 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
1620 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
1621 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
1622 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1623 ; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
1624 ; GCN-NEXT: s_waitcnt vmcnt(1)
1625 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
1626 ; GCN-NEXT: s_waitcnt vmcnt(0)
1627 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
1628 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1629 ; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
1630 ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112
1631 ; GCN-NEXT: s_waitcnt expcnt(0)
1632 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
1633 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92
1634 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
1635 ; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84
1636 ; GCN-NEXT: s_waitcnt vmcnt(3)
1637 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1638 ; GCN-NEXT: s_waitcnt vmcnt(2)
1639 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1640 ; GCN-NEXT: s_waitcnt vmcnt(1)
1641 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
1642 ; GCN-NEXT: s_waitcnt vmcnt(0)
1643 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
1644 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1645 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
1646 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80
1647 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76
1648 ; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
1649 ; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
1650 ; GCN-NEXT: s_waitcnt vmcnt(1)
1651 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
1652 ; GCN-NEXT: s_waitcnt vmcnt(0)
1653 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
1654 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
1655 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
1656 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1657 ; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
1658 ; GCN-NEXT: s_waitcnt vmcnt(1)
1659 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
1660 ; GCN-NEXT: s_waitcnt vmcnt(0)
1661 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
1662 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1663 ; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
1664 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
1665 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
1666 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
1667 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
1668 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3
1669 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2
1670 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1
1671 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
1672 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30
1673 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29
1674 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
1675 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27
1676 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26
1677 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
1678 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
1679 ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96
1680 ; GCN-NEXT: s_waitcnt expcnt(0)
1681 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32
1682 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32
1683 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
1684 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
1685 ; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16
1686 ; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
1687 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
1688 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
1689 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12
1690 ; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16
1691 ; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
1692 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
1693 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14
1694 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18
1695 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20
1696 ; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
1697 ; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16
1698 ; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16
1699 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
1700 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
1701 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22
1702 ; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
1703 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
1704 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
1705 ; GCN-NEXT: s_waitcnt vmcnt(9)
1706 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
1707 ; GCN-NEXT: s_waitcnt vmcnt(8)
1708 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
1709 ; GCN-NEXT: s_waitcnt vmcnt(7)
1710 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7
1711 ; GCN-NEXT: s_waitcnt vmcnt(6)
1712 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
1713 ; GCN-NEXT: s_waitcnt vmcnt(5)
1714 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
1715 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1716 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
1717 ; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7
1718 ; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16
1719 ; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16
1720 ; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16
1721 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
1722 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56
1723 ; GCN-NEXT: s_waitcnt vmcnt(6)
1724 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
1725 ; GCN-NEXT: s_waitcnt vmcnt(5)
1726 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13
1727 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1728 ; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
1729 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
1730 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48
1731 ; GCN-NEXT: s_waitcnt vmcnt(6)
1732 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
1733 ; GCN-NEXT: s_waitcnt vmcnt(5)
1734 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18
1735 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
1736 ; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16
1737 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
1738 ; GCN-NEXT: s_waitcnt vmcnt(5)
1739 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19
1740 ; GCN-NEXT: s_waitcnt vmcnt(4)
1741 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
1742 ; GCN-NEXT: s_waitcnt vmcnt(3)
1743 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
1744 ; GCN-NEXT: s_waitcnt vmcnt(2)
1745 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1746 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1747 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
1748 ; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16
1749 ; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16
1750 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
1751 ; GCN-NEXT: s_waitcnt vmcnt(2)
1752 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
1753 ; GCN-NEXT: s_waitcnt vmcnt(1)
1754 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
1755 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36
1756 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
1757 ; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16
1758 ; GCN-NEXT: s_waitcnt vmcnt(1)
1759 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
1760 ; GCN-NEXT: s_waitcnt vmcnt(0)
1761 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
1762 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
1763 ; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16
1764 ; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80
1765 ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64
1766 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48
1767 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
1768 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1769 ; GCN-NEXT: s_setpc_b64 s[30:31]
1771 ; GFX7-LABEL: v_store_global_v64bf16:
1773 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1774 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
1775 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
1776 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
1777 ; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116
1778 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112
1779 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108
1780 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
1781 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
1782 ; GFX7-NEXT: s_mov_b32 s6, 0
1783 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
1784 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1785 ; GFX7-NEXT: s_mov_b32 s4, s6
1786 ; GFX7-NEXT: s_mov_b32 s5, s6
1787 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1788 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
1789 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1790 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
1791 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1792 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
1793 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1794 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1795 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1796 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
1797 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
1798 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
1799 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
1800 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1801 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
1802 ; GFX7-NEXT: s_waitcnt vmcnt(7)
1803 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
1804 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1805 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
1806 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1807 ; GFX7-NEXT: s_waitcnt vmcnt(5)
1808 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
1809 ; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16
1810 ; GFX7-NEXT: s_waitcnt vmcnt(3)
1811 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37
1812 ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34
1813 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
1814 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1815 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38
1816 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1817 ; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16
1818 ; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16
1819 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1820 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39
1821 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1822 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48
1823 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
1824 ; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16
1825 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
1826 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
1827 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
1828 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92
1829 ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
1830 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84
1831 ; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80
1832 ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
1833 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1834 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
1835 ; GFX7-NEXT: s_nop 0
1836 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
1837 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
1838 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
1839 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
1840 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1841 ; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
1842 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72
1843 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
1844 ; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
1845 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
1846 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
1847 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1848 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
1849 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
1850 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
1851 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1852 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
1853 ; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
1854 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
1855 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1856 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
1857 ; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
1858 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21
1859 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1860 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20
1861 ; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16
1862 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
1863 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1864 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
1865 ; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
1866 ; GFX7-NEXT: s_waitcnt vmcnt(9)
1867 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37
1868 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28
1869 ; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64
1870 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1871 ; GFX7-NEXT: s_waitcnt vmcnt(9)
1872 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38
1873 ; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
1874 ; GFX7-NEXT: s_waitcnt vmcnt(8)
1875 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39
1876 ; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16
1877 ; GFX7-NEXT: s_waitcnt vmcnt(6)
1878 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49
1879 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
1880 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48
1881 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1882 ; GFX7-NEXT: s_waitcnt vmcnt(5)
1883 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50
1884 ; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16
1885 ; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16
1886 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
1887 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
1888 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24
1889 ; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
1890 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16
1891 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
1892 ; GFX7-NEXT: s_waitcnt vmcnt(8)
1893 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
1894 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
1895 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1896 ; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16
1897 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
1898 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1899 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
1900 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
1901 ; GFX7-NEXT: s_waitcnt vmcnt(7)
1902 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15
1903 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1904 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
1905 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
1906 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29
1907 ; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
1908 ; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52
1909 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
1910 ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
1911 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
1912 ; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
1913 ; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8
1914 ; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
1915 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
1916 ; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56
1917 ; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40
1918 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
1919 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1920 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
1921 ; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
1922 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
1923 ; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16
1924 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
1925 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24
1926 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16
1927 ; GFX7-NEXT: s_waitcnt vmcnt(14)
1928 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
1929 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1930 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
1931 ; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16
1932 ; GFX7-NEXT: s_waitcnt vmcnt(13)
1933 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18
1934 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1935 ; GFX7-NEXT: s_waitcnt vmcnt(12)
1936 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19
1937 ; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16
1938 ; GFX7-NEXT: s_waitcnt vmcnt(11)
1939 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22
1940 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1941 ; GFX7-NEXT: s_waitcnt vmcnt(10)
1942 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23
1943 ; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16
1944 ; GFX7-NEXT: s_waitcnt vmcnt(8)
1945 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35
1946 ; GFX7-NEXT: s_waitcnt vmcnt(5)
1947 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29
1948 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1949 ; GFX7-NEXT: s_waitcnt vmcnt(4)
1950 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
1951 ; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16
1952 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28
1953 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1954 ; GFX7-NEXT: s_waitcnt vmcnt(3)
1955 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33
1956 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1957 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34
1958 ; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16
1959 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
1960 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
1961 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1962 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
1963 ; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16
1964 ; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16
1965 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1966 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36
1967 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1968 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1969 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37
1970 ; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16
1971 ; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80
1972 ; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64
1973 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
1974 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
1975 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
1976 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64
1977 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1978 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1980 ; GFX8-LABEL: v_store_global_v64bf16:
1982 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1983 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
1984 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
1985 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
1986 ; GFX8-NEXT: s_movk_i32 s4, 0x70
1987 ; GFX8-NEXT: s_movk_i32 s5, 0x50
1988 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1989 ; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32
1990 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1991 ; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
1992 ; GFX8-NEXT: s_movk_i32 s4, 0x60
1993 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1994 ; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31]
1995 ; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
1996 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32
1997 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
1998 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32
1999 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
2000 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32
2001 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc
2002 ; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
2003 ; GFX8-NEXT: s_nop 0
2004 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32
2005 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc
2006 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32
2007 ; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc
2008 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32
2009 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
2010 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
2011 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
2012 ; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
2013 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
2014 ; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
2015 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2016 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2018 ; GFX9-LABEL: v_store_global_v64bf16:
2020 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2021 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
2022 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
2023 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
2024 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2025 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
2026 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
2027 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
2028 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
2029 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
2030 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
2031 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
2032 ; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
2033 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2036 ; GFX10-LABEL: v_store_global_v64bf16:
2038 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2039 ; GFX10-NEXT: s_clause 0x2
2040 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
2041 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
2042 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
2043 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2044 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
2045 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
2046 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
2047 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
2048 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
2049 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
2050 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
2051 ; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
2052 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2054 ; GFX11-LABEL: v_store_global_v64bf16:
2056 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2057 ; GFX11-NEXT: s_clause 0x2
2058 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
2059 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
2060 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
2061 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2062 ; GFX11-NEXT: s_clause 0x7
2063 ; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
2064 ; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
2065 ; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
2066 ; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
2067 ; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
2068 ; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
2069 ; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
2070 ; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
2071 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2072 store <64 x bfloat> %val, ptr addrspace(1) %ptr
; Stores two bfloat constants, 1.0 to %ptr0 and 42.0 to %ptr1.
; All prefixes expect the constants to be materialised with v_mov_b32 as the
; 16-bit immediates 0x3f80 and 0x4228 (the upper halves of the f32 encodings
; 0x3f800000 and 0x42280000) and written with a 16-bit store instruction.
; The checks differ per target only in the store opcode and in whether a
; register is reused for the second immediate.
2076 define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
2077 ; GCN-LABEL: test_store_fpimm:
2079 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2080 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2081 ; GCN-NEXT: s_mov_b32 s6, 0
2082 ; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
2083 ; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
2084 ; GCN-NEXT: s_mov_b32 s4, s6
2085 ; GCN-NEXT: s_mov_b32 s5, s6
2086 ; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2087 ; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
2088 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2089 ; GCN-NEXT: s_setpc_b64 s[30:31]
2091 ; GFX7-LABEL: test_store_fpimm:
2093 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2094 ; GFX7-NEXT: s_mov_b32 s6, 0
2095 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2096 ; GFX7-NEXT: s_mov_b32 s4, s6
2097 ; GFX7-NEXT: s_mov_b32 s5, s6
2098 ; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
2099 ; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
2100 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
2101 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2102 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2105 ; GFX8-LABEL: test_store_fpimm:
2107 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2108 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
2109 ; GFX8-NEXT: flat_store_short v[0:1], v4
2110 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
2111 ; GFX8-NEXT: flat_store_short v[2:3], v0
2112 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2113 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2115 ; GFX9-LABEL: test_store_fpimm:
2117 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2118 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
2119 ; GFX9-NEXT: global_store_short v[0:1], v4, off
2120 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
2121 ; GFX9-NEXT: global_store_short v[2:3], v0, off
2122 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2123 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2125 ; GFX10-LABEL: test_store_fpimm:
2127 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
2129 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
2130 ; GFX10-NEXT: global_store_short v[0:1], v4, off
2131 ; GFX10-NEXT: global_store_short v[2:3], v5, off
2132 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2134 ; GFX11-LABEL: test_store_fpimm:
2136 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2137 ; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
2138 ; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
2139 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off
2140 ; GFX11-NEXT: global_store_b16 v[2:3], v5, off
2141 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2142 store bfloat 1.0, ptr addrspace(1) %ptr0
2143 store bfloat 42.0, ptr addrspace(1) %ptr1
; Loads a float from %in, narrows it to bfloat with fptrunc and stores the
; result to %out (both pointers are addrspace(1)).
; - The GCN and GFX7 prefixes expect v_mul_f32 1.0 followed by a 16-bit right
;   shift, i.e. the stored value is the upper half of the 32-bit result.
; - GFX8 and later expect an integer sequence instead, which adds bit 16 of the
;   input plus 0x7fff to the input, and selects (input | 0x400000) when the
;   input compares unordered with itself, before the upper 16 bits are stored.
;   NOTE(review): this looks like round-to-nearest-even with NaN quieting;
;   confirm against the backend's fptrunc-to-bf16 lowering.
; - GFX9, GFX10 and GFX11 fold the final shift into a d16_hi store.
2147 define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2148 ; GCN-LABEL: test_load_store_f32_to_bf16:
2150 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2151 ; GCN-NEXT: s_mov_b32 s6, 0
2152 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2153 ; GCN-NEXT: s_mov_b32 s4, s6
2154 ; GCN-NEXT: s_mov_b32 s5, s6
2155 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2156 ; GCN-NEXT: s_waitcnt vmcnt(0)
2157 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2158 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2159 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2160 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2161 ; GCN-NEXT: s_setpc_b64 s[30:31]
2163 ; GFX7-LABEL: test_load_store_f32_to_bf16:
2165 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166 ; GFX7-NEXT: s_mov_b32 s6, 0
2167 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2168 ; GFX7-NEXT: s_mov_b32 s4, s6
2169 ; GFX7-NEXT: s_mov_b32 s5, s6
2170 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2171 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2172 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2173 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2174 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2175 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2176 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2178 ; GFX8-LABEL: test_load_store_f32_to_bf16:
2180 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2181 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2182 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2183 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
2184 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
2185 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
2186 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
2187 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
2188 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
2189 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2190 ; GFX8-NEXT: flat_store_short v[2:3], v0
2191 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2192 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2194 ; GFX9-LABEL: test_load_store_f32_to_bf16:
2196 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2197 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2198 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2200 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
2201 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
2202 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
2203 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
2204 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
2205 ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
2206 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2207 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2209 ; GFX10-LABEL: test_load_store_f32_to_bf16:
2211 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2212 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2213 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2214 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
2215 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
2216 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
2217 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
2218 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2219 ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
2220 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2222 ; GFX11-LABEL: test_load_store_f32_to_bf16:
2224 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2225 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2226 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2227 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
2228 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
2229 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
2230 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2231 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
2232 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
2233 ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
2234 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2235 %val = load float, ptr addrspace(1) %in
2236 %val.bf16 = fptrunc float %val to bfloat
2237 store bfloat %val.bf16, ptr addrspace(1) %out
; Loads a double from %in (addrspace(1)), narrows it to bfloat with fptrunc,
; and stores the bfloat to %out (addrspace(1)).
; For the GCN and GFX7 prefixes the expected code is a v_cvt_f32_f64 followed
; by a right shift by 16 and a short store. The GFX8 and later prefixes expect
; a longer compare/select sequence before the 16-bit store.
; NOTE(review): the longer sequence looks like an explicit rounding and NaN
; fix-up for the narrowing conversion - confirm against the backend lowering.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2241 define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2242 ; GCN-LABEL: test_load_store_f64_to_bf16:
2244 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2245 ; GCN-NEXT: s_mov_b32 s6, 0
2246 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2247 ; GCN-NEXT: s_mov_b32 s4, s6
2248 ; GCN-NEXT: s_mov_b32 s5, s6
2249 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2250 ; GCN-NEXT: s_waitcnt vmcnt(0)
2251 ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2252 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2253 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2254 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2255 ; GCN-NEXT: s_setpc_b64 s[30:31]
2257 ; GFX7-LABEL: test_load_store_f64_to_bf16:
2259 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2260 ; GFX7-NEXT: s_mov_b32 s6, 0
2261 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2262 ; GFX7-NEXT: s_mov_b32 s4, s6
2263 ; GFX7-NEXT: s_mov_b32 s5, s6
2264 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2265 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2266 ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2267 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2268 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
2269 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2270 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2272 ; GFX8-LABEL: test_load_store_f64_to_bf16:
2274 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2275 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2276 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2277 ; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2278 ; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
2279 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2280 ; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
2281 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
2282 ; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2283 ; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2284 ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
2285 ; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
2286 ; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
2287 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2288 ; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
2289 ; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
2290 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
2291 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
2292 ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2293 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
2294 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
2295 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2296 ; GFX8-NEXT: flat_store_short v[2:3], v0
2297 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2298 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2300 ; GFX9-LABEL: test_load_store_f64_to_bf16:
2302 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2303 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2304 ; GFX9-NEXT: s_brev_b32 s8, 1
2305 ; GFX9-NEXT: s_movk_i32 s9, 0x7fff
2306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2308 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2309 ; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
2310 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
2311 ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2312 ; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2313 ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2314 ; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
2315 ; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
2316 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2317 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2318 ; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
2319 ; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
2320 ; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
2321 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
2322 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
2323 ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
2324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2325 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2327 ; GFX10-LABEL: test_load_store_f64_to_bf16:
2329 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2330 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2331 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2332 ; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2333 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2334 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
2335 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2336 ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
2337 ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2338 ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
2339 ; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
2340 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
2341 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2342 ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2343 ; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2344 ; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
2345 ; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
2346 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
2347 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2348 ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
2349 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2351 ; GFX11-LABEL: test_load_store_f64_to_bf16:
2353 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2354 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2355 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2356 ; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2357 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2358 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2359 ; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
2360 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2361 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2362 ; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
2363 ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
2364 ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
2365 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2366 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
2367 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
2368 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2369 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2370 ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2371 ; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2372 ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
2373 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2374 ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
2375 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
2376 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2377 ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
2378 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2379 %val = load double, ptr addrspace(1) %in
2380 %val.bf16 = fptrunc double %val to bfloat
2381 store bfloat %val.bf16, ptr addrspace(1) %out
; Loads a bfloat from %in (addrspace(1)), widens it to float with fpext, and
; stores the float to %out (addrspace(1)).
; Every checked prefix expects the same shape: a 16-bit unsigned load, a left
; shift by 16, and a 32-bit store. No conversion instruction is expected.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2385 define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2386 ; GCN-LABEL: test_load_store_bf16_to_f32:
2388 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2389 ; GCN-NEXT: s_mov_b32 s6, 0
2390 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2391 ; GCN-NEXT: s_mov_b32 s4, s6
2392 ; GCN-NEXT: s_mov_b32 s5, s6
2393 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2394 ; GCN-NEXT: s_waitcnt vmcnt(0)
2395 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2396 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2397 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2398 ; GCN-NEXT: s_setpc_b64 s[30:31]
2400 ; GFX7-LABEL: test_load_store_bf16_to_f32:
2402 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2403 ; GFX7-NEXT: s_mov_b32 s6, 0
2404 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2405 ; GFX7-NEXT: s_mov_b32 s4, s6
2406 ; GFX7-NEXT: s_mov_b32 s5, s6
2407 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2408 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2409 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2410 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2411 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2412 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2414 ; GFX8-LABEL: test_load_store_bf16_to_f32:
2416 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
2418 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2419 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2420 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2421 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2422 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2424 ; GFX9-LABEL: test_load_store_bf16_to_f32:
2426 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2427 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
2428 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2429 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2430 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2431 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2432 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2434 ; GFX10-LABEL: test_load_store_bf16_to_f32:
2436 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
2438 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2439 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2440 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2441 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2443 ; GFX11-LABEL: test_load_store_bf16_to_f32:
2445 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2446 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
2447 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2448 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2449 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
2450 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2451 %val = load bfloat, ptr addrspace(1) %in
2452 %val.f32 = fpext bfloat %val to float
2453 store float %val.f32, ptr addrspace(1) %out
; Loads a bfloat from %in (addrspace(1)), widens it to double with fpext, and
; stores the double to %out (addrspace(1)).
; Every checked prefix expects a 16-bit unsigned load, a left shift by 16, a
; v_cvt_f64_f32 conversion, and a 64-bit store. The GFX11 checks additionally
; contain an s_delay_alu between the shift and the conversion.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2457 define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2458 ; GCN-LABEL: test_load_store_bf16_to_f64:
2460 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2461 ; GCN-NEXT: s_mov_b32 s6, 0
2462 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2463 ; GCN-NEXT: s_mov_b32 s4, s6
2464 ; GCN-NEXT: s_mov_b32 s5, s6
2465 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2466 ; GCN-NEXT: s_waitcnt vmcnt(0)
2467 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2468 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2469 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2470 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2471 ; GCN-NEXT: s_setpc_b64 s[30:31]
2473 ; GFX7-LABEL: test_load_store_bf16_to_f64:
2475 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2476 ; GFX7-NEXT: s_mov_b32 s6, 0
2477 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2478 ; GFX7-NEXT: s_mov_b32 s4, s6
2479 ; GFX7-NEXT: s_mov_b32 s5, s6
2480 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
2481 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2482 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2483 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2484 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2485 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2486 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2488 ; GFX8-LABEL: test_load_store_bf16_to_f64:
2490 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2491 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
2492 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2493 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2494 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2495 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2496 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2497 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2499 ; GFX9-LABEL: test_load_store_bf16_to_f64:
2501 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2502 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
2503 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2504 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2505 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2506 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2507 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2508 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2510 ; GFX10-LABEL: test_load_store_bf16_to_f64:
2512 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2513 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
2514 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2515 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2516 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2517 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2518 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2520 ; GFX11-LABEL: test_load_store_bf16_to_f64:
2522 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2523 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
2524 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2525 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2526 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2527 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2528 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2529 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2530 %val = load bfloat, ptr addrspace(1) %in
2531 %val.f64 = fpext bfloat %val to double
2532 store double %val.f64, ptr addrspace(1) %out
; Copies a <2 x bfloat> value from %in to %out (both addrspace(1)) with a
; plain load followed by a store of the same value.
; Every checked prefix expects one 32-bit load and one 32-bit store, with no
; instruction modifying the loaded register in between.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2536 define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2537 ; GCN-LABEL: test_load_store_v2bf16:
2539 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2540 ; GCN-NEXT: s_mov_b32 s6, 0
2541 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2542 ; GCN-NEXT: s_mov_b32 s4, s6
2543 ; GCN-NEXT: s_mov_b32 s5, s6
2544 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2545 ; GCN-NEXT: s_waitcnt vmcnt(0)
2546 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2547 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2548 ; GCN-NEXT: s_setpc_b64 s[30:31]
2550 ; GFX7-LABEL: test_load_store_v2bf16:
2552 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2553 ; GFX7-NEXT: s_mov_b32 s6, 0
2554 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2555 ; GFX7-NEXT: s_mov_b32 s4, s6
2556 ; GFX7-NEXT: s_mov_b32 s5, s6
2557 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2558 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2559 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2560 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2561 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2563 ; GFX8-LABEL: test_load_store_v2bf16:
2565 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2566 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2567 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2568 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2569 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2570 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2572 ; GFX9-LABEL: test_load_store_v2bf16:
2574 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2576 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2577 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2578 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2579 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2581 ; GFX10-LABEL: test_load_store_v2bf16:
2583 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2584 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2585 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2586 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2587 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2589 ; GFX11-LABEL: test_load_store_v2bf16:
2591 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2592 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2593 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2594 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
2595 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2596 %val = load <2 x bfloat>, ptr addrspace(1) %in
2597 store <2 x bfloat> %val, ptr addrspace(1) %out
; Copies a <4 x bfloat> value from %in to %out (both addrspace(1)) with a
; plain load followed by a store of the same value.
; Every checked prefix expects one 64-bit load into v[0:1] and one 64-bit
; store of that register pair, with no instruction modifying it in between.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2601 define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2602 ; GCN-LABEL: test_load_store_v4bf16:
2604 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2605 ; GCN-NEXT: s_mov_b32 s6, 0
2606 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2607 ; GCN-NEXT: s_mov_b32 s4, s6
2608 ; GCN-NEXT: s_mov_b32 s5, s6
2609 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2610 ; GCN-NEXT: s_waitcnt vmcnt(0)
2611 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2612 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2613 ; GCN-NEXT: s_setpc_b64 s[30:31]
2615 ; GFX7-LABEL: test_load_store_v4bf16:
2617 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2618 ; GFX7-NEXT: s_mov_b32 s6, 0
2619 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2620 ; GFX7-NEXT: s_mov_b32 s4, s6
2621 ; GFX7-NEXT: s_mov_b32 s5, s6
2622 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2623 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2624 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
2625 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2626 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2628 ; GFX8-LABEL: test_load_store_v4bf16:
2630 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2632 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2633 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2634 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2635 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2637 ; GFX9-LABEL: test_load_store_v4bf16:
2639 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2640 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2642 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2643 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2644 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2646 ; GFX10-LABEL: test_load_store_v4bf16:
2648 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2649 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2650 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2651 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2652 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2654 ; GFX11-LABEL: test_load_store_v4bf16:
2656 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2657 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2658 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2659 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2660 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2661 %val = load <4 x bfloat>, ptr addrspace(1) %in
2662 store <4 x bfloat> %val, ptr addrspace(1) %out
; Copies an <8 x bfloat> value from %in to %out (both addrspace(1)) with a
; plain load followed by a store of the same value.
; Every checked prefix expects one 128-bit load into v[4:7] and one 128-bit
; store of that register range, with no instruction modifying it in between.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2666 define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2667 ; GCN-LABEL: test_load_store_v8bf16:
2669 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2670 ; GCN-NEXT: s_mov_b32 s6, 0
2671 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2672 ; GCN-NEXT: s_mov_b32 s4, s6
2673 ; GCN-NEXT: s_mov_b32 s5, s6
2674 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2675 ; GCN-NEXT: s_waitcnt vmcnt(0)
2676 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2678 ; GCN-NEXT: s_setpc_b64 s[30:31]
2680 ; GFX7-LABEL: test_load_store_v8bf16:
2682 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2683 ; GFX7-NEXT: s_mov_b32 s6, 0
2684 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2685 ; GFX7-NEXT: s_mov_b32 s4, s6
2686 ; GFX7-NEXT: s_mov_b32 s5, s6
2687 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
2688 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2689 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
2690 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2691 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2693 ; GFX8-LABEL: test_load_store_v8bf16:
2695 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2696 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2697 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2698 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
2699 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2700 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2702 ; GFX9-LABEL: test_load_store_v8bf16:
2704 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2705 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
2706 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2707 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
2708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2709 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2711 ; GFX10-LABEL: test_load_store_v8bf16:
2713 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2714 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
2715 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2716 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
2717 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2719 ; GFX11-LABEL: test_load_store_v8bf16:
2721 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2722 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
2723 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2724 ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off
2725 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2726 %val = load <8 x bfloat>, ptr addrspace(1) %in
2727 store <8 x bfloat> %val, ptr addrspace(1) %out
; Copies a <16 x bfloat> value from %in to %out (both addrspace(1)) with a
; plain load followed by a store of the same value.
; Every checked prefix expects the copy to be split into two 128-bit loads and
; two 128-bit stores covering byte offsets 0 and 16. Most prefixes encode the
; second half with an "offset:16" operand; the GFX8 checks instead compute the
; +16 addresses with v_add_u32 / v_addc_u32 pairs.
; The GFX10 and GFX11 checks group the two loads under s_clause 0x1.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2731 define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2732 ; GCN-LABEL: test_load_store_v16bf16:
2734 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2735 ; GCN-NEXT: s_mov_b32 s6, 0
2736 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2737 ; GCN-NEXT: s_mov_b32 s4, s6
2738 ; GCN-NEXT: s_mov_b32 s5, s6
2739 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2740 ; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2741 ; GCN-NEXT: s_waitcnt vmcnt(1)
2742 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2743 ; GCN-NEXT: s_waitcnt vmcnt(1)
2744 ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2745 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2746 ; GCN-NEXT: s_setpc_b64 s[30:31]
2748 ; GFX7-LABEL: test_load_store_v16bf16:
2750 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2751 ; GFX7-NEXT: s_mov_b32 s6, 0
2752 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2753 ; GFX7-NEXT: s_mov_b32 s4, s6
2754 ; GFX7-NEXT: s_mov_b32 s5, s6
2755 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
2756 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
2757 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2758 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
2759 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2760 ; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
2761 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2762 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2764 ; GFX8-LABEL: test_load_store_v16bf16:
2766 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2767 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0
2768 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
2769 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2770 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2771 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
2772 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
2773 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2774 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
2775 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2776 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
2777 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2778 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2780 ; GFX9-LABEL: test_load_store_v16bf16:
2782 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2783 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
2784 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
2785 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2786 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
2787 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2788 ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
2789 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2790 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2792 ; GFX10-LABEL: test_load_store_v16bf16:
2794 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2795 ; GFX10-NEXT: s_clause 0x1
2796 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
2797 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
2798 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2799 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
2800 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2801 ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
2802 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2804 ; GFX11-LABEL: test_load_store_v16bf16:
2806 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2807 ; GFX11-NEXT: s_clause 0x1
2808 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
2809 ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off
2810 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2811 ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
2812 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2813 ; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off
2814 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2815 %val = load <16 x bfloat>, ptr addrspace(1) %in
2816 store <16 x bfloat> %val, ptr addrspace(1) %out
; Stores the scalar bfloat argument %in to %out (addrspace(1)).
; For the GCN and GFX7 prefixes the expected code multiplies v0 by 1.0 and
; shifts it right by 16 before a short store through v[1:2]. The GFX8 and
; later prefixes expect v0 to be stored directly with a single 16-bit store.
; NOTE(review): the multiply/shift suggests the argument arrives widened to
; 32 bits on the two oldest targets - confirm against the calling convention.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2820 define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
2821 ; GCN-LABEL: test_arg_store:
2823 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2824 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2825 ; GCN-NEXT: s_mov_b32 s6, 0
2826 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2827 ; GCN-NEXT: s_mov_b32 s4, s6
2828 ; GCN-NEXT: s_mov_b32 s5, s6
2829 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2830 ; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2831 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2832 ; GCN-NEXT: s_setpc_b64 s[30:31]
2834 ; GFX7-LABEL: test_arg_store:
2836 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2837 ; GFX7-NEXT: s_mov_b32 s6, 0
2838 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2839 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2840 ; GFX7-NEXT: s_mov_b32 s4, s6
2841 ; GFX7-NEXT: s_mov_b32 s5, s6
2842 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2843 ; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
2844 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2845 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2847 ; GFX8-LABEL: test_arg_store:
2849 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2850 ; GFX8-NEXT: flat_store_short v[1:2], v0
2851 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2852 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2854 ; GFX9-LABEL: test_arg_store:
2856 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2857 ; GFX9-NEXT: global_store_short v[1:2], v0, off
2858 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2859 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2861 ; GFX10-LABEL: test_arg_store:
2863 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2864 ; GFX10-NEXT: global_store_short v[1:2], v0, off
2865 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2867 ; GFX11-LABEL: test_arg_store:
2869 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870 ; GFX11-NEXT: global_store_b16 v[1:2], v0, off
2871 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2872 store bfloat %in, ptr addrspace(1) %out
; Stores the <2 x bfloat> argument %in to %out (addrspace(1)).
; For the GCN and GFX7 prefixes the expected code multiplies v0 and v1 by 1.0,
; combines them into one dword with a shift and v_alignbit_b32, and stores
; that dword through v[2:3]. The GFX8 and later prefixes expect v0 to be
; stored directly as one dword through v[1:2].
; NOTE(review): this suggests one register per element on the two oldest
; targets and a packed register on newer ones - confirm against the calling
; convention.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2876 define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
2877 ; GCN-LABEL: test_arg_store_v2bf16:
2879 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
2881 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2882 ; GCN-NEXT: s_mov_b32 s6, 0
2883 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2884 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
2885 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2886 ; GCN-NEXT: s_mov_b32 s4, s6
2887 ; GCN-NEXT: s_mov_b32 s5, s6
2888 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2889 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2890 ; GCN-NEXT: s_setpc_b64 s[30:31]
2892 ; GFX7-LABEL: test_arg_store_v2bf16:
2894 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2895 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
2896 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2897 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2898 ; GFX7-NEXT: s_mov_b32 s6, 0
2899 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
2900 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2901 ; GFX7-NEXT: s_mov_b32 s4, s6
2902 ; GFX7-NEXT: s_mov_b32 s5, s6
2903 ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
2904 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2905 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2907 ; GFX8-LABEL: test_arg_store_v2bf16:
2909 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2910 ; GFX8-NEXT: flat_store_dword v[1:2], v0
2911 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2912 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2914 ; GFX9-LABEL: test_arg_store_v2bf16:
2916 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2917 ; GFX9-NEXT: global_store_dword v[1:2], v0, off
2918 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2919 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2921 ; GFX10-LABEL: test_arg_store_v2bf16:
2923 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2924 ; GFX10-NEXT: global_store_dword v[1:2], v0, off
2925 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2927 ; GFX11-LABEL: test_arg_store_v2bf16:
2929 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2930 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off
2931 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2932 store <2 x bfloat> %in, ptr addrspace(1) %out
; Stores the <3 x bfloat> argument %in to %out (addrspace(1)).
; Every checked prefix expects the store to be split into a dword store at
; offset 0 and a short store at offset 4.
; For the GCN and GFX7 prefixes v0, v1 and v2 are each multiplied by 1.0; the
; first two are combined with v_alignbit_b32 for the dword store and the third
; is shifted right by 16 for the short store. The GFX8 and later prefixes
; expect v0 and v1 to be stored directly; the GFX8 checks compute the +4
; address with v_add_u32 / v_addc_u32 instead of using an offset operand.
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
2936 define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
2937 ; GCN-LABEL: test_arg_store_v3bf16:
2939 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2940 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
2941 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
2942 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2943 ; GCN-NEXT: s_mov_b32 s6, 0
2944 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
2945 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2946 ; GCN-NEXT: s_mov_b32 s4, s6
2947 ; GCN-NEXT: s_mov_b32 s5, s6
2948 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2949 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
2950 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
2951 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2952 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2953 ; GCN-NEXT: s_setpc_b64 s[30:31]
2955 ; GFX7-LABEL: test_arg_store_v3bf16:
2957 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2958 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
2959 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2960 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
2961 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
2962 ; GFX7-NEXT: s_mov_b32 s6, 0
2963 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
2964 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2965 ; GFX7-NEXT: s_mov_b32 s4, s6
2966 ; GFX7-NEXT: s_mov_b32 s5, s6
2967 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2968 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
2969 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
2970 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2971 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2973 ; GFX8-LABEL: test_arg_store_v3bf16:
2975 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2976 ; GFX8-NEXT: flat_store_dword v[2:3], v0
2977 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
2978 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2979 ; GFX8-NEXT: flat_store_short v[2:3], v1
2980 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2981 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2983 ; GFX9-LABEL: test_arg_store_v3bf16:
2985 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2986 ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
2987 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
2988 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2989 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2991 ; GFX10-LABEL: test_arg_store_v3bf16:
2993 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2994 ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
2995 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
2996 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2998 ; GFX11-LABEL: test_arg_store_v3bf16:
3000 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3001 ; GFX11-NEXT: s_clause 0x1
3002 ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
3003 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off
3004 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3005 store <3 x bfloat> %in, ptr addrspace(1) %out
; Stores the <4 x bfloat> argument %in to %out (addrspace(1)).
; For the GCN and GFX7 prefixes the expected code multiplies v0 through v3 by
; 1.0, combines them pairwise into two dwords with shifts and v_alignbit_b32,
; and writes both with one 64-bit store through v[4:5]. The GFX8 and later
; prefixes expect v[0:1] to be stored directly with one 64-bit store through
; v[2:3].
; The check lines are autogenerated (see the NOTE in the file header);
; regenerate them with the update script rather than editing them by hand.
3009 define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
3010 ; GCN-LABEL: test_arg_store_v4bf16:
3012 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3013 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3014 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
3015 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3016 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3017 ; GCN-NEXT: s_mov_b32 s6, 0
3018 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3019 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
3020 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
3021 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
3022 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3023 ; GCN-NEXT: s_mov_b32 s4, s6
3024 ; GCN-NEXT: s_mov_b32 s5, s6
3025 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
3026 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3027 ; GCN-NEXT: s_setpc_b64 s[30:31]
3029 ; GFX7-LABEL: test_arg_store_v4bf16:
3031 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3032 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3033 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3034 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3035 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3036 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3037 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3038 ; GFX7-NEXT: s_mov_b32 s6, 0
3039 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
3040 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
3041 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3042 ; GFX7-NEXT: s_mov_b32 s4, s6
3043 ; GFX7-NEXT: s_mov_b32 s5, s6
3044 ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
3045 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3046 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3048 ; GFX8-LABEL: test_arg_store_v4bf16:
3050 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3051 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
3052 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3053 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3055 ; GFX9-LABEL: test_arg_store_v4bf16:
3057 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3058 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
3059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3060 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3062 ; GFX10-LABEL: test_arg_store_v4bf16:
3064 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3065 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
3066 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3068 ; GFX11-LABEL: test_arg_store_v4bf16:
3070 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3071 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
3072 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3073 store <4 x bfloat> %in, ptr addrspace(1) %out
; Stores an <8 x bfloat> function argument through a ptr addrspace(1).
; The checks for the default amdgcn target and for hawaii expect one VGPR per
; element (v0-v7), four v_alignbit_b32 packs and one buffer_store_dwordx4.
; The checks for tonga and newer expect v[0:3] to be written directly with a
; single 128-bit store.
3077 define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
3078 ; GCN-LABEL: test_arg_store_v8bf16:
3080 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3081 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3082 ; GCN-NEXT: s_mov_b32 s6, 0
3083 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
3084 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
3085 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
3086 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
3087 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3088 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
3089 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3090 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3091 ; GCN-NEXT: s_mov_b32 s4, s6
3092 ; GCN-NEXT: s_mov_b32 s5, s6
3093 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
3094 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3095 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
3096 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
3097 ; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
3098 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
3099 ; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
3100 ; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
3101 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
3102 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3103 ; GCN-NEXT: s_setpc_b64 s[30:31]
3105 ; GFX7-LABEL: test_arg_store_v8bf16:
3107 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3108 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
3109 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
3110 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3111 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3112 ; GFX7-NEXT: s_mov_b32 s6, 0
3113 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3114 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
3115 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3116 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
3117 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3118 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3119 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3120 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3121 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3122 ; GFX7-NEXT: s_mov_b32 s4, s6
3123 ; GFX7-NEXT: s_mov_b32 s5, s6
3124 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
3125 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
3126 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
3127 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
3128 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
3129 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3130 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3132 ; GFX8-LABEL: test_arg_store_v8bf16:
3134 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3135 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3136 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3137 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3139 ; GFX9-LABEL: test_arg_store_v8bf16:
3141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3142 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3143 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3144 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3146 ; GFX10-LABEL: test_arg_store_v8bf16:
3148 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3149 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3150 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3152 ; GFX11-LABEL: test_arg_store_v8bf16:
3154 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3155 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
3156 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3157 store <8 x bfloat> %in, ptr addrspace(1) %out
; Stores a <16 x bfloat> function argument through a ptr addrspace(1).
; Every target's checks expect the 32-byte value to be written as two 128-bit
; stores, one at offset 16 and one at offset 0. The default amdgcn target and
; hawaii first pack sixteen per-element VGPRs (v0-v15) with v_alignbit_b32.
; tonga computes the second address with v_add_u32/v_addc_u32, while gfx900
; and newer use an immediate offset:16 (gfx1100 groups both under s_clause).
3161 define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
3162 ; GCN-LABEL: test_arg_store_v16bf16:
3164 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3165 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
3166 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
3167 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
3168 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
3169 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
3170 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
3171 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3172 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3173 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3174 ; GCN-NEXT: s_mov_b32 s6, 0
3175 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
3176 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
3177 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
3178 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
3179 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
3180 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
3181 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
3182 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
3183 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3184 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3185 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
3186 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
3187 ; GCN-NEXT: s_mov_b32 s4, s6
3188 ; GCN-NEXT: s_mov_b32 s5, s6
3189 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
3190 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
3191 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
3192 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
3193 ; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
3194 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
3195 ; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
3196 ; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
3197 ; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
3198 ; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
3199 ; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
3200 ; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
3201 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
3202 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
3203 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3204 ; GCN-NEXT: s_setpc_b64 s[30:31]
3206 ; GFX7-LABEL: test_arg_store_v16bf16:
3208 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3209 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
3210 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
3211 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3212 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
3213 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
3214 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3215 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
3216 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3217 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3218 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
3219 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
3220 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
3221 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
3222 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3223 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
3224 ; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
3225 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
3226 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3227 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
3228 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
3229 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
3230 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3231 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
3232 ; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
3233 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
3234 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
3235 ; GFX7-NEXT: s_mov_b32 s6, 0
3236 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3237 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
3238 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
3239 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
3240 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3241 ; GFX7-NEXT: s_mov_b32 s4, s6
3242 ; GFX7-NEXT: s_mov_b32 s5, s6
3243 ; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
3244 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
3245 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
3246 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
3247 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3248 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3250 ; GFX8-LABEL: test_arg_store_v16bf16:
3252 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3253 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3254 ; GFX8-NEXT: s_nop 0
3255 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
3256 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
3257 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
3258 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3259 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3261 ; GFX9-LABEL: test_arg_store_v16bf16:
3263 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3264 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
3265 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3266 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3267 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3269 ; GFX10-LABEL: test_arg_store_v16bf16:
3271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3272 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
3273 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3274 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3276 ; GFX11-LABEL: test_arg_store_v16bf16:
3278 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3279 ; GFX11-NEXT: s_clause 0x1
3280 ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
3281 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
3282 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3283 store <16 x bfloat> %in, ptr addrspace(1) %out
; Stores a bfloat argument marked inreg, in an amdgpu_gfx function, through a
; ptr addrspace(1). All check blocks read the value from s4 and move it into
; v2 before a 16-bit store. The default amdgcn target and hawaii do so with
; v_mul_f32_e64 + v_lshrrev_b32, the other targets with a plain v_mov_b32.
3287 define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
3288 ; GCN-LABEL: test_inreg_arg_store:
3290 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3291 ; GCN-NEXT: s_mov_b32 s39, 0xf000
3292 ; GCN-NEXT: s_mov_b32 s38, 0
3293 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4
3294 ; GCN-NEXT: s_mov_b32 s36, s38
3295 ; GCN-NEXT: s_mov_b32 s37, s38
3296 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3297 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3298 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3299 ; GCN-NEXT: s_setpc_b64 s[30:31]
3301 ; GFX7-LABEL: test_inreg_arg_store:
3303 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3304 ; GFX7-NEXT: s_mov_b32 s38, 0
3305 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
3306 ; GFX7-NEXT: s_mov_b32 s39, 0xf000
3307 ; GFX7-NEXT: s_mov_b32 s36, s38
3308 ; GFX7-NEXT: s_mov_b32 s37, s38
3309 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3310 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
3311 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3312 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3314 ; GFX8-LABEL: test_inreg_arg_store:
3316 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3317 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3318 ; GFX8-NEXT: flat_store_short v[0:1], v2
3319 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3320 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3322 ; GFX9-LABEL: test_inreg_arg_store:
3324 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3325 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
3326 ; GFX9-NEXT: global_store_short v[0:1], v2, off
3327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3328 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3330 ; GFX10-LABEL: test_inreg_arg_store:
3332 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
3334 ; GFX10-NEXT: global_store_short v[0:1], v2, off
3335 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3337 ; GFX11-LABEL: test_inreg_arg_store:
3339 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
3341 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3342 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3343 store bfloat %in, ptr addrspace(1) %out
; Writes %val into a byval(bfloat) slot in addrspace(5) and loads it back as
; %retval. Every check block expects only a 16-bit store addressed off s32
; (scratch_store_b16 on gfx1100) and no load instruction, with v0 left
; untouched for the return.
; NOTE(review): the return statement is not visible in this excerpt;
; presumably the function returns %retval - confirm against the full file.
3347 define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
3348 ; GCN-LABEL: test_byval:
3350 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3351 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0
3352 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3353 ; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32
3354 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3355 ; GCN-NEXT: s_setpc_b64 s[30:31]
3357 ; GFX7-LABEL: test_byval:
3359 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3360 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0
3361 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3362 ; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32
3363 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3364 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3366 ; GFX8-LABEL: test_byval:
3368 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3369 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
3370 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3371 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3373 ; GFX9-LABEL: test_byval:
3375 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3376 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
3377 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3378 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3380 ; GFX10-LABEL: test_byval:
3382 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3383 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
3384 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3386 ; GFX11-LABEL: test_byval:
3388 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3389 ; GFX11-NEXT: scratch_store_b16 off, v0, s32
3390 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3391 store bfloat %val, ptr addrspace(5) %bv
3392 %retval = load bfloat, ptr addrspace(5) %bv
; Stores a bfloat argument through an sret(bfloat) pointer in addrspace(5).
; The checks take the destination address from v0 and the value from v1.
; Targets up to gfx1010 expect buffer_store_short ... offen, and gfx1100
; expects scratch_store_b16.
3396 define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
3397 ; GCN-LABEL: test_sret:
3399 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3400 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3401 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3402 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3403 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3404 ; GCN-NEXT: s_setpc_b64 s[30:31]
3406 ; GFX7-LABEL: test_sret:
3408 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3409 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
3410 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3411 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3412 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3413 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3415 ; GFX8-LABEL: test_sret:
3417 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3418 ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3419 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3420 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3422 ; GFX9-LABEL: test_sret:
3424 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3425 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3426 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3427 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3429 ; GFX10-LABEL: test_sret:
3431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3432 ; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
3433 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3435 ; GFX11-LABEL: test_sret:
3437 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3438 ; GFX11-NEXT: scratch_store_b16 v0, v1, off
3439 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3440 store bfloat %val, ptr addrspace(5) %sret
; Loads a bfloat from %in, bitcasts it to i16 and stores the i16 to %out.
; Every check block expects a 16-bit load followed (after an s_waitcnt) by a
; 16-bit store of the same register, with no conversion instruction between.
3444 define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
3445 ; GCN-LABEL: test_bitcast_from_bfloat:
3447 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3448 ; GCN-NEXT: s_mov_b32 s6, 0
3449 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3450 ; GCN-NEXT: s_mov_b32 s4, s6
3451 ; GCN-NEXT: s_mov_b32 s5, s6
3452 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3453 ; GCN-NEXT: s_waitcnt vmcnt(0)
3454 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3455 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3456 ; GCN-NEXT: s_setpc_b64 s[30:31]
3458 ; GFX7-LABEL: test_bitcast_from_bfloat:
3460 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3461 ; GFX7-NEXT: s_mov_b32 s6, 0
3462 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3463 ; GFX7-NEXT: s_mov_b32 s4, s6
3464 ; GFX7-NEXT: s_mov_b32 s5, s6
3465 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
3466 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3467 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
3468 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3469 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3471 ; GFX8-LABEL: test_bitcast_from_bfloat:
3473 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3474 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
3475 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3476 ; GFX8-NEXT: flat_store_short v[2:3], v0
3477 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3478 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3480 ; GFX9-LABEL: test_bitcast_from_bfloat:
3482 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3483 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
3484 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3485 ; GFX9-NEXT: global_store_short v[2:3], v0, off
3486 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3487 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3489 ; GFX10-LABEL: test_bitcast_from_bfloat:
3491 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3492 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
3493 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3494 ; GFX10-NEXT: global_store_short v[2:3], v0, off
3495 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3497 ; GFX11-LABEL: test_bitcast_from_bfloat:
3499 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3500 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
3501 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3502 ; GFX11-NEXT: global_store_b16 v[2:3], v0, off
3503 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3504 %val = load bfloat, ptr addrspace(1) %in
3505 %val_int = bitcast bfloat %val to i16
3506 store i16 %val_int, ptr addrspace(1) %out
; Loads an i16 from %in, bitcasts it to bfloat and stores the bfloat to %out.
; The parameter order here is (%out, %in), so the checks load from v[2:3] and
; store to v[0:1]. Every check block expects a 16-bit load followed by a
; 16-bit store of the same register, with no conversion instruction between.
3510 define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
3511 ; GCN-LABEL: test_bitcast_to_bfloat:
3513 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3514 ; GCN-NEXT: s_mov_b32 s6, 0
3515 ; GCN-NEXT: s_mov_b32 s7, 0xf000
3516 ; GCN-NEXT: s_mov_b32 s4, s6
3517 ; GCN-NEXT: s_mov_b32 s5, s6
3518 ; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3519 ; GCN-NEXT: s_waitcnt vmcnt(0)
3520 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3521 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3522 ; GCN-NEXT: s_setpc_b64 s[30:31]
3524 ; GFX7-LABEL: test_bitcast_to_bfloat:
3526 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3527 ; GFX7-NEXT: s_mov_b32 s6, 0
3528 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3529 ; GFX7-NEXT: s_mov_b32 s4, s6
3530 ; GFX7-NEXT: s_mov_b32 s5, s6
3531 ; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
3532 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3533 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
3534 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3535 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3537 ; GFX8-LABEL: test_bitcast_to_bfloat:
3539 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3540 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
3541 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3542 ; GFX8-NEXT: flat_store_short v[0:1], v2
3543 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3544 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3546 ; GFX9-LABEL: test_bitcast_to_bfloat:
3548 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3549 ; GFX9-NEXT: global_load_ushort v2, v[2:3], off
3550 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3551 ; GFX9-NEXT: global_store_short v[0:1], v2, off
3552 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3553 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3555 ; GFX10-LABEL: test_bitcast_to_bfloat:
3557 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3558 ; GFX10-NEXT: global_load_ushort v2, v[2:3], off
3559 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3560 ; GFX10-NEXT: global_store_short v[0:1], v2, off
3561 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3563 ; GFX11-LABEL: test_bitcast_to_bfloat:
3565 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3566 ; GFX11-NEXT: global_load_u16 v2, v[2:3], off
3567 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3568 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3569 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3570 %val = load i16, ptr addrspace(1) %in
3571 %val_fp = bitcast i16 %val to bfloat
3572 store bfloat %val_fp, ptr addrspace(1) %out
; Takes a scalar bfloat and returns a bfloat. Every check block expects only
; the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions between
; function entry and the return.
; NOTE(review): the IR body is not visible in this excerpt; presumably it is
; a plain "ret bfloat %in" in a block named entry - confirm against the file.
3576 define bfloat @test_ret(bfloat %in) {
3577 ; GCN-LABEL: test_ret:
3578 ; GCN: ; %bb.0: ; %entry
3579 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580 ; GCN-NEXT: s_setpc_b64 s[30:31]
3582 ; GFX7-LABEL: test_ret:
3583 ; GFX7: ; %bb.0: ; %entry
3584 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3585 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3587 ; GFX8-LABEL: test_ret:
3588 ; GFX8: ; %bb.0: ; %entry
3589 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3590 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3592 ; GFX9-LABEL: test_ret:
3593 ; GFX9: ; %bb.0: ; %entry
3594 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3595 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3597 ; GFX10-LABEL: test_ret:
3598 ; GFX10: ; %bb.0: ; %entry
3599 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3600 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3602 ; GFX11-LABEL: test_ret:
3603 ; GFX11: ; %bb.0: ; %entry
3604 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3605 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Returns its <2 x bfloat> argument unchanged. Every check block expects only
; the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions are
; emitted to move the argument into the return location.
3610 define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
3611 ; GCN-LABEL: test_ret_v2bf16:
3612 ; GCN: ; %bb.0: ; %entry
3613 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3614 ; GCN-NEXT: s_setpc_b64 s[30:31]
3616 ; GFX7-LABEL: test_ret_v2bf16:
3617 ; GFX7: ; %bb.0: ; %entry
3618 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3619 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3621 ; GFX8-LABEL: test_ret_v2bf16:
3622 ; GFX8: ; %bb.0: ; %entry
3623 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3624 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3626 ; GFX9-LABEL: test_ret_v2bf16:
3627 ; GFX9: ; %bb.0: ; %entry
3628 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3629 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3631 ; GFX10-LABEL: test_ret_v2bf16:
3632 ; GFX10: ; %bb.0: ; %entry
3633 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3634 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3636 ; GFX11-LABEL: test_ret_v2bf16:
3637 ; GFX11: ; %bb.0: ; %entry
3638 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3639 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3641 ret <2 x bfloat> %in
; Returns its <3 x bfloat> argument unchanged. Every check block expects only
; the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions are
; emitted to move the argument into the return location.
3644 define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
3645 ; GCN-LABEL: test_ret_v3bf16:
3646 ; GCN: ; %bb.0: ; %entry
3647 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3648 ; GCN-NEXT: s_setpc_b64 s[30:31]
3650 ; GFX7-LABEL: test_ret_v3bf16:
3651 ; GFX7: ; %bb.0: ; %entry
3652 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3653 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3655 ; GFX8-LABEL: test_ret_v3bf16:
3656 ; GFX8: ; %bb.0: ; %entry
3657 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3658 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3660 ; GFX9-LABEL: test_ret_v3bf16:
3661 ; GFX9: ; %bb.0: ; %entry
3662 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3663 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3665 ; GFX10-LABEL: test_ret_v3bf16:
3666 ; GFX10: ; %bb.0: ; %entry
3667 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3670 ; GFX11-LABEL: test_ret_v3bf16:
3671 ; GFX11: ; %bb.0: ; %entry
3672 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3673 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3675 ret <3 x bfloat> %in
; Returns its <4 x bfloat> argument unchanged. Every check block expects only
; the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions are
; emitted to move the argument into the return location.
3678 define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
3679 ; GCN-LABEL: test_ret_v4bf16:
3680 ; GCN: ; %bb.0: ; %entry
3681 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3682 ; GCN-NEXT: s_setpc_b64 s[30:31]
3684 ; GFX7-LABEL: test_ret_v4bf16:
3685 ; GFX7: ; %bb.0: ; %entry
3686 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3687 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3689 ; GFX8-LABEL: test_ret_v4bf16:
3690 ; GFX8: ; %bb.0: ; %entry
3691 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3692 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3694 ; GFX9-LABEL: test_ret_v4bf16:
3695 ; GFX9: ; %bb.0: ; %entry
3696 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3697 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3699 ; GFX10-LABEL: test_ret_v4bf16:
3700 ; GFX10: ; %bb.0: ; %entry
3701 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3702 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3704 ; GFX11-LABEL: test_ret_v4bf16:
3705 ; GFX11: ; %bb.0: ; %entry
3706 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3707 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3709 ret <4 x bfloat> %in
; Returns its <8 x bfloat> argument unchanged. Every check block expects only
; the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions are
; emitted to move the argument into the return location.
3712 define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
3713 ; GCN-LABEL: test_ret_v8bf16:
3714 ; GCN: ; %bb.0: ; %entry
3715 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3716 ; GCN-NEXT: s_setpc_b64 s[30:31]
3718 ; GFX7-LABEL: test_ret_v8bf16:
3719 ; GFX7: ; %bb.0: ; %entry
3720 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3721 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3723 ; GFX8-LABEL: test_ret_v8bf16:
3724 ; GFX8: ; %bb.0: ; %entry
3725 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3726 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3728 ; GFX9-LABEL: test_ret_v8bf16:
3729 ; GFX9: ; %bb.0: ; %entry
3730 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3731 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3733 ; GFX10-LABEL: test_ret_v8bf16:
3734 ; GFX10: ; %bb.0: ; %entry
3735 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3736 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3738 ; GFX11-LABEL: test_ret_v8bf16:
3739 ; GFX11: ; %bb.0: ; %entry
3740 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3741 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3743 ret <8 x bfloat> %in
; Returns its <16 x bfloat> argument unchanged. Every check block expects
; only the entry s_waitcnt followed by s_setpc_b64, i.e. no instructions are
; emitted to move the argument into the return location.
3746 define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
3747 ; GCN-LABEL: test_ret_v16bf16:
3748 ; GCN: ; %bb.0: ; %entry
3749 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3750 ; GCN-NEXT: s_setpc_b64 s[30:31]
3752 ; GFX7-LABEL: test_ret_v16bf16:
3753 ; GFX7: ; %bb.0: ; %entry
3754 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3755 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3757 ; GFX8-LABEL: test_ret_v16bf16:
3758 ; GFX8: ; %bb.0: ; %entry
3759 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3760 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3762 ; GFX9-LABEL: test_ret_v16bf16:
3763 ; GFX9: ; %bb.0: ; %entry
3764 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3765 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3767 ; GFX10-LABEL: test_ret_v16bf16:
3768 ; GFX10: ; %bb.0: ; %entry
3769 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3770 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3772 ; GFX11-LABEL: test_ret_v16bf16:
3773 ; GFX11: ; %bb.0: ; %entry
3774 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3775 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3777 ret <16 x bfloat> %in
3780 define void @test_call(bfloat %in, ptr addrspace(5) %out) {
3781 ; GCN-LABEL: test_call:
3782 ; GCN: ; %bb.0: ; %entry
3783 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3784 ; GCN-NEXT: s_mov_b32 s18, s33
3785 ; GCN-NEXT: s_mov_b32 s33, s32
3786 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
3787 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3788 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
3789 ; GCN-NEXT: s_addk_i32 s32, 0x400
3790 ; GCN-NEXT: s_waitcnt expcnt(0)
3791 ; GCN-NEXT: v_writelane_b32 v2, s30, 0
3792 ; GCN-NEXT: v_writelane_b32 v2, s31, 1
3793 ; GCN-NEXT: s_getpc_b64 s[16:17]
3794 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3795 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3796 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3797 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
3798 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
3799 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3800 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3801 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3802 ; GCN-NEXT: s_waitcnt vmcnt(0)
3803 ; GCN-NEXT: v_readlane_b32 s31, v2, 1
3804 ; GCN-NEXT: v_readlane_b32 s30, v2, 0
3805 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
3806 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3807 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
3808 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
3809 ; GCN-NEXT: s_mov_b32 s33, s18
3810 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3811 ; GCN-NEXT: s_setpc_b64 s[30:31]
3813 ; GFX7-LABEL: test_call:
3814 ; GFX7: ; %bb.0: ; %entry
3815 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3816 ; GFX7-NEXT: s_mov_b32 s18, s33
3817 ; GFX7-NEXT: s_mov_b32 s33, s32
3818 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
3819 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3820 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
3821 ; GFX7-NEXT: s_addk_i32 s32, 0x400
3822 ; GFX7-NEXT: s_getpc_b64 s[16:17]
3823 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3824 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3825 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3826 ; GFX7-NEXT: v_writelane_b32 v2, s30, 0
3827 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1
3828 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3829 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
3830 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3831 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3832 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3833 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3834 ; GFX7-NEXT: v_readlane_b32 s31, v2, 1
3835 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0
3836 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
3837 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3838 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
3839 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
3840 ; GFX7-NEXT: s_mov_b32 s33, s18
3841 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3842 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3844 ; GFX8-LABEL: test_call:
3845 ; GFX8: ; %bb.0: ; %entry
3846 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3847 ; GFX8-NEXT: s_mov_b32 s18, s33
3848 ; GFX8-NEXT: s_mov_b32 s33, s32
3849 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
3850 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3851 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
3852 ; GFX8-NEXT: s_addk_i32 s32, 0x400
3853 ; GFX8-NEXT: s_getpc_b64 s[16:17]
3854 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3855 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3856 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3857 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0
3858 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1
3859 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3860 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
3861 ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3862 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3863 ; GFX8-NEXT: v_readlane_b32 s31, v2, 1
3864 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0
3865 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
3866 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3867 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
3868 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
3869 ; GFX8-NEXT: s_mov_b32 s33, s18
3870 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3871 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3873 ; GFX9-LABEL: test_call:
3874 ; GFX9: ; %bb.0: ; %entry
3875 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3876 ; GFX9-NEXT: s_mov_b32 s18, s33
3877 ; GFX9-NEXT: s_mov_b32 s33, s32
3878 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
3879 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3880 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
3881 ; GFX9-NEXT: s_addk_i32 s32, 0x400
3882 ; GFX9-NEXT: s_getpc_b64 s[16:17]
3883 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3884 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3885 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3886 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0
3887 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1
3888 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3889 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
3890 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3891 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3892 ; GFX9-NEXT: v_readlane_b32 s31, v2, 1
3893 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0
3894 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
3895 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3896 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
3897 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
3898 ; GFX9-NEXT: s_mov_b32 s33, s18
3899 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3900 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3902 ; GFX10-LABEL: test_call:
3903 ; GFX10: ; %bb.0: ; %entry
3904 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3905 ; GFX10-NEXT: s_mov_b32 s18, s33
3906 ; GFX10-NEXT: s_mov_b32 s33, s32
3907 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
3908 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
3909 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3910 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
3911 ; GFX10-NEXT: s_addk_i32 s32, 0x200
3912 ; GFX10-NEXT: s_getpc_b64 s[16:17]
3913 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
3914 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
3915 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0
3916 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3917 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1
3918 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3919 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
3920 ; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
3921 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3922 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1
3923 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0
3924 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
3925 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
3926 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
3927 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
3928 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
3929 ; GFX10-NEXT: s_mov_b32 s33, s18
3930 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3931 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3933 ; GFX11-LABEL: test_call:
3934 ; GFX11: ; %bb.0: ; %entry
3935 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3936 ; GFX11-NEXT: s_mov_b32 s2, s33
3937 ; GFX11-NEXT: s_mov_b32 s33, s32
3938 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
3939 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
3940 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
3941 ; GFX11-NEXT: s_add_i32 s32, s32, 16
3942 ; GFX11-NEXT: s_getpc_b64 s[0:1]
3943 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4
3944 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
3945 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0
3946 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3947 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1
3948 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3949 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
3950 ; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
3951 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3952 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1
3953 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0
3954 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
3955 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
3956 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
3957 ; GFX11-NEXT: s_add_i32 s32, s32, -16
3958 ; GFX11-NEXT: s_mov_b32 s33, s2
3959 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3960 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3962 %result = call bfloat @test_arg_store(bfloat %in)
3963 store volatile bfloat %result, ptr addrspace(5) %out
; test_call_v2bf16: passes %in to @test_arg_store_v2bf16 through a
; <2 x bfloat> -> <2 x bfloat> call signature and volatile-stores the returned
; vector to the addrspace(5) pointer %out.
;
; Code shape recorded by the assertions below:
;  - every prefix saves s30/s31 into lanes 0 and 1 of a VGPR around the
;    s_swappc_b64 call, and spills/reloads that VGPR at s33;
;  - under the GCN and GFX7 prefixes, v0 and v1 are each multiplied by 1.0 and
;    shifted right by 16, then written with two short stores (v1 at v2 + 2,
;    v0 at v2);
;  - under the GFX8, GFX9, GFX10 and GFX11 prefixes the result is written with
;    a single dword store of v0.
;
; The assertions are autogenerated (see the first line of this file); refresh
; them with utils/update_llc_test_checks.py instead of editing them by hand.
3967 define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
3968 ; GCN-LABEL: test_call_v2bf16:
3969 ; GCN: ; %bb.0: ; %entry
3970 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3971 ; GCN-NEXT: s_mov_b32 s18, s33
3972 ; GCN-NEXT: s_mov_b32 s33, s32
3973 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
3974 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
3975 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
3976 ; GCN-NEXT: s_addk_i32 s32, 0x400
3977 ; GCN-NEXT: s_waitcnt expcnt(0)
3978 ; GCN-NEXT: v_writelane_b32 v4, s30, 0
3979 ; GCN-NEXT: v_writelane_b32 v4, s31, 1
3980 ; GCN-NEXT: s_getpc_b64 s[16:17]
3981 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
3982 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
3983 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
3984 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
3985 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
3986 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
3987 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
3988 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2
3989 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3990 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3991 ; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
3992 ; GCN-NEXT: s_waitcnt vmcnt(0)
3993 ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
3994 ; GCN-NEXT: s_waitcnt vmcnt(0)
3995 ; GCN-NEXT: v_readlane_b32 s31, v4, 1
3996 ; GCN-NEXT: v_readlane_b32 s30, v4, 0
3997 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
3998 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
3999 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4000 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
4001 ; GCN-NEXT: s_mov_b32 s33, s18
4002 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4003 ; GCN-NEXT: s_setpc_b64 s[30:31]
4005 ; GFX7-LABEL: test_call_v2bf16:
4006 ; GFX7: ; %bb.0: ; %entry
4007 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4008 ; GFX7-NEXT: s_mov_b32 s18, s33
4009 ; GFX7-NEXT: s_mov_b32 s33, s32
4010 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4011 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4012 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4013 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4014 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4015 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4016 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4017 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4018 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0
4019 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1
4020 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4021 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4022 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4023 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4024 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4025 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2
4026 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4027 ; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
4028 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4029 ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
4030 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4031 ; GFX7-NEXT: v_readlane_b32 s31, v4, 1
4032 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0
4033 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4034 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4035 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4036 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
4037 ; GFX7-NEXT: s_mov_b32 s33, s18
4038 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4039 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4041 ; GFX8-LABEL: test_call_v2bf16:
4042 ; GFX8: ; %bb.0: ; %entry
4043 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4044 ; GFX8-NEXT: s_mov_b32 s18, s33
4045 ; GFX8-NEXT: s_mov_b32 s33, s32
4046 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4047 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4048 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4049 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4050 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4051 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4052 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4053 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4054 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0
4055 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1
4056 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4057 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4058 ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4059 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4060 ; GFX8-NEXT: v_readlane_b32 s31, v2, 1
4061 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0
4062 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4063 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4064 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4065 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
4066 ; GFX8-NEXT: s_mov_b32 s33, s18
4067 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4068 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4070 ; GFX9-LABEL: test_call_v2bf16:
4071 ; GFX9: ; %bb.0: ; %entry
4072 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4073 ; GFX9-NEXT: s_mov_b32 s18, s33
4074 ; GFX9-NEXT: s_mov_b32 s33, s32
4075 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4076 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4077 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4078 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4079 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4080 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4081 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4082 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4083 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0
4084 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1
4085 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4086 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4087 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4089 ; GFX9-NEXT: v_readlane_b32 s31, v2, 1
4090 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0
4091 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4092 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4093 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4094 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
4095 ; GFX9-NEXT: s_mov_b32 s33, s18
4096 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4097 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4099 ; GFX10-LABEL: test_call_v2bf16:
4100 ; GFX10: ; %bb.0: ; %entry
4101 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4102 ; GFX10-NEXT: s_mov_b32 s18, s33
4103 ; GFX10-NEXT: s_mov_b32 s33, s32
4104 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4105 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
4106 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4107 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4108 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4109 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4110 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4111 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4112 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0
4113 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4114 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1
4115 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4116 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4117 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
4118 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4119 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1
4120 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0
4121 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4122 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
4123 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4124 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4125 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
4126 ; GFX10-NEXT: s_mov_b32 s33, s18
4127 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4128 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4130 ; GFX11-LABEL: test_call_v2bf16:
4131 ; GFX11: ; %bb.0: ; %entry
4132 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4133 ; GFX11-NEXT: s_mov_b32 s2, s33
4134 ; GFX11-NEXT: s_mov_b32 s33, s32
4135 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4136 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
4137 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4138 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4139 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4140 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4141 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4142 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0
4143 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4144 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1
4145 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4146 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4147 ; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
4148 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4149 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1
4150 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0
4151 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4152 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
4153 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4154 ; GFX11-NEXT: s_add_i32 s32, s32, -16
4155 ; GFX11-NEXT: s_mov_b32 s33, s2
4156 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4157 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4159 %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
4160 store volatile <2 x bfloat> %result, ptr addrspace(5) %out
; test_call_v3bf16: passes %in through a <3 x bfloat> -> <3 x bfloat> call
; signature and volatile-stores the returned vector to the addrspace(5)
; pointer %out.
;
; Code shape recorded by the assertions below:
;  - every prefix saves s30/s31 into lanes 0 and 1 of a VGPR around the
;    s_swappc_b64 call, and spills/reloads that VGPR at s33;
;  - under the GCN and GFX7 prefixes, v0/v1 are combined into one dword with
;    v_alignbit_b32 and the third value is written separately as a short at
;    an address 4 bytes past the dword;
;  - under the GFX8, GFX9, GFX10 and GFX11 prefixes, v1 is written as a short
;    at offset 4 and v0 as a dword at offset 0 (GFX8 computes the +4 address
;    with an explicit add, the later ones use an immediate offset).
;
; The assertions are autogenerated (see the first line of this file); refresh
; them with utils/update_llc_test_checks.py instead of editing them by hand.
4164 define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
4165 ; GCN-LABEL: test_call_v3bf16:
4166 ; GCN: ; %bb.0: ; %entry
4167 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4168 ; GCN-NEXT: s_mov_b32 s18, s33
4169 ; GCN-NEXT: s_mov_b32 s33, s32
4170 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4171 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4172 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4173 ; GCN-NEXT: s_addk_i32 s32, 0x400
4174 ; GCN-NEXT: s_waitcnt expcnt(0)
4175 ; GCN-NEXT: v_writelane_b32 v5, s30, 0
4176 ; GCN-NEXT: v_writelane_b32 v5, s31, 1
4177 ; GCN-NEXT: s_getpc_b64 s[16:17]
4178 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4179 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4180 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4181 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4182 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4183 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4184 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4185 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4186 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3
4187 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4188 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4189 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
4190 ; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen
4191 ; GCN-NEXT: s_waitcnt vmcnt(0)
4192 ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
4193 ; GCN-NEXT: s_waitcnt vmcnt(0)
4194 ; GCN-NEXT: v_readlane_b32 s31, v5, 1
4195 ; GCN-NEXT: v_readlane_b32 s30, v5, 0
4196 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4197 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4198 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4199 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
4200 ; GCN-NEXT: s_mov_b32 s33, s18
4201 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4202 ; GCN-NEXT: s_setpc_b64 s[30:31]
4204 ; GFX7-LABEL: test_call_v3bf16:
4205 ; GFX7: ; %bb.0: ; %entry
4206 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4207 ; GFX7-NEXT: s_mov_b32 s18, s33
4208 ; GFX7-NEXT: s_mov_b32 s33, s32
4209 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4210 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4211 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4212 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4213 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4214 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4215 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4216 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4217 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0
4218 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1
4219 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4220 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4221 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4222 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4223 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4224 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
4225 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
4226 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4227 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3
4228 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4229 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4230 ; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
4231 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4232 ; GFX7-NEXT: v_readlane_b32 s31, v4, 1
4233 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0
4234 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4235 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4236 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4237 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
4238 ; GFX7-NEXT: s_mov_b32 s33, s18
4239 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4240 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4242 ; GFX8-LABEL: test_call_v3bf16:
4243 ; GFX8: ; %bb.0: ; %entry
4244 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4245 ; GFX8-NEXT: s_mov_b32 s18, s33
4246 ; GFX8-NEXT: s_mov_b32 s33, s32
4247 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4248 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4249 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4250 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4251 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4252 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4253 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4254 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4255 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0
4256 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1
4257 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4258 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4259 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
4260 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
4261 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4262 ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4263 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4264 ; GFX8-NEXT: v_readlane_b32 s31, v4, 1
4265 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0
4266 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4267 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4268 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4269 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
4270 ; GFX8-NEXT: s_mov_b32 s33, s18
4271 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4272 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4274 ; GFX9-LABEL: test_call_v3bf16:
4275 ; GFX9: ; %bb.0: ; %entry
4276 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4277 ; GFX9-NEXT: s_mov_b32 s18, s33
4278 ; GFX9-NEXT: s_mov_b32 s33, s32
4279 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4280 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4281 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4282 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4283 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4284 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4285 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4286 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4287 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0
4288 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1
4289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4290 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4291 ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4292 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4293 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4294 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4295 ; GFX9-NEXT: v_readlane_b32 s31, v3, 1
4296 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0
4297 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4298 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4299 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4300 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
4301 ; GFX9-NEXT: s_mov_b32 s33, s18
4302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4303 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4305 ; GFX10-LABEL: test_call_v3bf16:
4306 ; GFX10: ; %bb.0: ; %entry
4307 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4308 ; GFX10-NEXT: s_mov_b32 s18, s33
4309 ; GFX10-NEXT: s_mov_b32 s33, s32
4310 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4311 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4312 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4313 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4314 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4315 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4316 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4317 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4318 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0
4319 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4320 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1
4321 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4322 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4323 ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
4324 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4325 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4326 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4327 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1
4328 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0
4329 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4330 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4331 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4332 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4333 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
4334 ; GFX10-NEXT: s_mov_b32 s33, s18
4335 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4336 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4338 ; GFX11-LABEL: test_call_v3bf16:
4339 ; GFX11: ; %bb.0: ; %entry
4340 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4341 ; GFX11-NEXT: s_mov_b32 s2, s33
4342 ; GFX11-NEXT: s_mov_b32 s33, s32
4343 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4344 ; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4345 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4346 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4347 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4348 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4349 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4350 ; GFX11-NEXT: v_writelane_b32 v3, s30, 0
4351 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4352 ; GFX11-NEXT: v_writelane_b32 v3, s31, 1
4353 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4354 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4355 ; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
4356 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4357 ; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc
4358 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4359 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1
4360 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0
4361 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4362 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4363 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4364 ; GFX11-NEXT: s_add_i32 s32, s32, -16
4365 ; GFX11-NEXT: s_mov_b32 s33, s2
4366 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4367 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; TODO(review): the callee below is @test_arg_store_v2bf16 although this call
; uses a <3 x bfloat> signature; this looks like a copy/paste leftover but may
; be deliberate. The assertions above reference the same symbol, so any
; rename must be followed by regenerating them.
4369 %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
4370 store volatile <3 x bfloat> %result, ptr addrspace(5) %out
; test_call_v4bf16: passes %in through a <4 x bfloat> -> <4 x bfloat> call
; signature and volatile-stores the returned vector to the addrspace(5)
; pointer %out.
;
; Code shape recorded by the assertions below:
;  - every prefix saves s30/s31 into lanes 0 and 1 of a VGPR around the
;    s_swappc_b64 call, and spills/reloads that VGPR at s33;
;  - under the GCN and GFX7 prefixes, v0..v3 are each multiplied by 1.0 and
;    shifted right by 16, then written with four short stores at offsets
;    6, 4, 2 and 0 from v4;
;  - under the GFX8, GFX9 and GFX10 prefixes, v1 and v0 are written with two
;    dword stores at offsets 4 and 0 (GFX8 computes the +4 address with an
;    explicit add, the later ones use an immediate offset);
;  - under the GFX11 prefix, v[0:1] is written with one scratch_store_b64.
;
; The assertions are autogenerated (see the first line of this file); refresh
; them with utils/update_llc_test_checks.py instead of editing them by hand.
4374 define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
4375 ; GCN-LABEL: test_call_v4bf16:
4376 ; GCN: ; %bb.0: ; %entry
4377 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4378 ; GCN-NEXT: s_mov_b32 s18, s33
4379 ; GCN-NEXT: s_mov_b32 s33, s32
4380 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4381 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
4382 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4383 ; GCN-NEXT: s_addk_i32 s32, 0x400
4384 ; GCN-NEXT: s_waitcnt expcnt(0)
4385 ; GCN-NEXT: v_writelane_b32 v8, s30, 0
4386 ; GCN-NEXT: v_writelane_b32 v8, s31, 1
4387 ; GCN-NEXT: s_getpc_b64 s[16:17]
4388 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4389 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4390 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4391 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4392 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4393 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4394 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4395 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4396 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4397 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4
4398 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4
4399 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4
4400 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4401 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4402 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4403 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4404 ; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
4405 ; GCN-NEXT: s_waitcnt vmcnt(0)
4406 ; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen
4407 ; GCN-NEXT: s_waitcnt vmcnt(0)
4408 ; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen
4409 ; GCN-NEXT: s_waitcnt vmcnt(0)
4410 ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
4411 ; GCN-NEXT: s_waitcnt vmcnt(0)
4412 ; GCN-NEXT: v_readlane_b32 s31, v8, 1
4413 ; GCN-NEXT: v_readlane_b32 s30, v8, 0
4414 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4415 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
4416 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4417 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
4418 ; GCN-NEXT: s_mov_b32 s33, s18
4419 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4420 ; GCN-NEXT: s_setpc_b64 s[30:31]
4422 ; GFX7-LABEL: test_call_v4bf16:
4423 ; GFX7: ; %bb.0: ; %entry
4424 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4425 ; GFX7-NEXT: s_mov_b32 s18, s33
4426 ; GFX7-NEXT: s_mov_b32 s33, s32
4427 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4428 ; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4429 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4430 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4431 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4432 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4433 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4434 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4435 ; GFX7-NEXT: v_writelane_b32 v6, s30, 0
4436 ; GFX7-NEXT: v_writelane_b32 v6, s31, 1
4437 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4438 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4439 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
4440 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
4441 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4442 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4
4443 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4444 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4445 ; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
4446 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4447 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4
4448 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4449 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4450 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
4451 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4452 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4
4453 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4454 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4455 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4456 ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
4457 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4458 ; GFX7-NEXT: v_readlane_b32 s31, v6, 1
4459 ; GFX7-NEXT: v_readlane_b32 s30, v6, 0
4460 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4461 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4462 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4463 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
4464 ; GFX7-NEXT: s_mov_b32 s33, s18
4465 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4466 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4468 ; GFX8-LABEL: test_call_v4bf16:
4469 ; GFX8: ; %bb.0: ; %entry
4470 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4471 ; GFX8-NEXT: s_mov_b32 s18, s33
4472 ; GFX8-NEXT: s_mov_b32 s33, s32
4473 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4474 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
4475 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4476 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4477 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4478 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4479 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4480 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4481 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0
4482 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1
4483 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4484 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4485 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
4486 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
4487 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4488 ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4489 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4490 ; GFX8-NEXT: v_readlane_b32 s31, v4, 1
4491 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0
4492 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4493 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
4494 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4495 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
4496 ; GFX8-NEXT: s_mov_b32 s33, s18
4497 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4498 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4500 ; GFX9-LABEL: test_call_v4bf16:
4501 ; GFX9: ; %bb.0: ; %entry
4502 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4503 ; GFX9-NEXT: s_mov_b32 s18, s33
4504 ; GFX9-NEXT: s_mov_b32 s33, s32
4505 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4506 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4507 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4508 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4509 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4510 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4511 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4512 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4513 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0
4514 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1
4515 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4516 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4517 ; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4518 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4519 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4520 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4521 ; GFX9-NEXT: v_readlane_b32 s31, v3, 1
4522 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0
4523 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4524 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4525 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4526 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
4527 ; GFX9-NEXT: s_mov_b32 s33, s18
4528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4529 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4531 ; GFX10-LABEL: test_call_v4bf16:
4532 ; GFX10: ; %bb.0: ; %entry
4533 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4534 ; GFX10-NEXT: s_mov_b32 s18, s33
4535 ; GFX10-NEXT: s_mov_b32 s33, s32
4536 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4537 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
4538 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4539 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4540 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4541 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4542 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4543 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4544 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0
4545 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4546 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1
4547 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4548 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4549 ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4550 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4551 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
4552 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4553 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1
4554 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0
4555 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4556 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
4557 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4558 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4559 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
4560 ; GFX10-NEXT: s_mov_b32 s33, s18
4561 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4562 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4564 ; GFX11-LABEL: test_call_v4bf16:
4565 ; GFX11: ; %bb.0: ; %entry
4566 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4567 ; GFX11-NEXT: s_mov_b32 s2, s33
4568 ; GFX11-NEXT: s_mov_b32 s33, s32
4569 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4570 ; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
4571 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4572 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4573 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4574 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4575 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4576 ; GFX11-NEXT: v_writelane_b32 v3, s30, 0
4577 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4578 ; GFX11-NEXT: v_writelane_b32 v3, s31, 1
4579 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4580 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4581 ; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
4582 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4583 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1
4584 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0
4585 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4586 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
4587 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4588 ; GFX11-NEXT: s_add_i32 s32, s32, -16
4589 ; GFX11-NEXT: s_mov_b32 s33, s2
4590 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4591 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; TODO(review): the callee below is @test_arg_store_v2bf16 although this call
; uses a <4 x bfloat> signature; this looks like a copy/paste leftover but may
; be deliberate. The assertions above reference the same symbol, so any
; rename must be followed by regenerating them.
4593 %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
4594 store volatile <4 x bfloat> %result, ptr addrspace(5) %out
; test_call_v8bf16: passes the <8 x bfloat> argument %in to
; @test_arg_store_v2bf16 (called here with an <8 x bfloat> argument and an
; <8 x bfloat> return type) and writes the returned vector to the
; addrspace(5) pointer %out with a volatile store.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. As recorded in those assertions, after the call the
; result is written with eight 16-bit stores (buffer_store_short) under the
; GCN and GFX7 prefixes, four 32-bit stores (buffer_store_dword) under the
; GFX8, GFX9 and GFX10 prefixes, and one 128-bit store (scratch_store_b128)
; under the GFX11 prefix.
;
; NOTE(review): the callee is named for v2bf16 but is called with an
; <8 x bfloat> signature; its declaration is not visible in this part of the
; file - confirm the name reuse is intentional.
4598 define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
4599 ; GCN-LABEL: test_call_v8bf16:
4600 ; GCN: ; %bb.0: ; %entry
4601 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4602 ; GCN-NEXT: s_mov_b32 s18, s33
4603 ; GCN-NEXT: s_mov_b32 s33, s32
4604 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4605 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill
4606 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4607 ; GCN-NEXT: s_addk_i32 s32, 0x400
4608 ; GCN-NEXT: s_waitcnt expcnt(0)
4609 ; GCN-NEXT: v_writelane_b32 v16, s30, 0
4610 ; GCN-NEXT: v_writelane_b32 v16, s31, 1
4611 ; GCN-NEXT: s_getpc_b64 s[16:17]
4612 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4613 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4614 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4615 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4616 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4617 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4618 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4619 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4620 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4621 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4622 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
4623 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
4624 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
4625 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8
4626 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8
4627 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8
4628 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8
4629 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8
4630 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8
4631 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8
4632 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4633 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4634 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4635 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4636 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4637 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4638 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4639 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4640 ; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
4641 ; GCN-NEXT: s_waitcnt vmcnt(0)
4642 ; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen
4643 ; GCN-NEXT: s_waitcnt vmcnt(0)
4644 ; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen
4645 ; GCN-NEXT: s_waitcnt vmcnt(0)
4646 ; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen
4647 ; GCN-NEXT: s_waitcnt vmcnt(0)
4648 ; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen
4649 ; GCN-NEXT: s_waitcnt vmcnt(0)
4650 ; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen
4651 ; GCN-NEXT: s_waitcnt vmcnt(0)
4652 ; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen
4653 ; GCN-NEXT: s_waitcnt vmcnt(0)
4654 ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
4655 ; GCN-NEXT: s_waitcnt vmcnt(0)
4656 ; GCN-NEXT: v_readlane_b32 s31, v16, 1
4657 ; GCN-NEXT: v_readlane_b32 s30, v16, 0
4658 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4659 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
4660 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4661 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
4662 ; GCN-NEXT: s_mov_b32 s33, s18
4663 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4664 ; GCN-NEXT: s_setpc_b64 s[30:31]
4666 ; GFX7-LABEL: test_call_v8bf16:
4667 ; GFX7: ; %bb.0: ; %entry
4668 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4669 ; GFX7-NEXT: s_mov_b32 s18, s33
4670 ; GFX7-NEXT: s_mov_b32 s33, s32
4671 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4672 ; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
4673 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4674 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4675 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4676 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4677 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4678 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4679 ; GFX7-NEXT: v_writelane_b32 v10, s30, 0
4680 ; GFX7-NEXT: v_writelane_b32 v10, s31, 1
4681 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4682 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
4683 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
4684 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
4685 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4686 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8
4687 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
4688 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4689 ; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
4690 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4691 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8
4692 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
4693 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4694 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
4695 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4696 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8
4697 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
4698 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4699 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
4700 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4701 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8
4702 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
4703 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4704 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
4705 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4706 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8
4707 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4708 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4709 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
4710 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4711 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8
4712 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4713 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4714 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
4715 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4716 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8
4717 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4718 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
4719 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4720 ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
4721 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4722 ; GFX7-NEXT: v_readlane_b32 s31, v10, 1
4723 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0
4724 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
4725 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
4726 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
4727 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
4728 ; GFX7-NEXT: s_mov_b32 s33, s18
4729 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4730 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4732 ; GFX8-LABEL: test_call_v8bf16:
4733 ; GFX8: ; %bb.0: ; %entry
4734 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4735 ; GFX8-NEXT: s_mov_b32 s18, s33
4736 ; GFX8-NEXT: s_mov_b32 s33, s32
4737 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
4738 ; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
4739 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
4740 ; GFX8-NEXT: s_addk_i32 s32, 0x400
4741 ; GFX8-NEXT: s_getpc_b64 s[16:17]
4742 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4743 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4744 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4745 ; GFX8-NEXT: v_writelane_b32 v6, s30, 0
4746 ; GFX8-NEXT: v_writelane_b32 v6, s31, 1
4747 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
4748 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
4749 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4
4750 ; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4751 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4752 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4
4753 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
4754 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4755 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
4756 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
4757 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4758 ; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4759 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4760 ; GFX8-NEXT: v_readlane_b32 s31, v6, 1
4761 ; GFX8-NEXT: v_readlane_b32 s30, v6, 0
4762 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
4763 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
4764 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
4765 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
4766 ; GFX8-NEXT: s_mov_b32 s33, s18
4767 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4768 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4770 ; GFX9-LABEL: test_call_v8bf16:
4771 ; GFX9: ; %bb.0: ; %entry
4772 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4773 ; GFX9-NEXT: s_mov_b32 s18, s33
4774 ; GFX9-NEXT: s_mov_b32 s33, s32
4775 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
4776 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4777 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
4778 ; GFX9-NEXT: s_addk_i32 s32, 0x400
4779 ; GFX9-NEXT: s_getpc_b64 s[16:17]
4780 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4781 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4782 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4783 ; GFX9-NEXT: v_writelane_b32 v5, s30, 0
4784 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1
4785 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4786 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
4787 ; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4788 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4789 ; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4790 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4791 ; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4792 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4793 ; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4794 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4795 ; GFX9-NEXT: v_readlane_b32 s31, v5, 1
4796 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0
4797 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
4798 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4799 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
4800 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
4801 ; GFX9-NEXT: s_mov_b32 s33, s18
4802 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4803 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4805 ; GFX10-LABEL: test_call_v8bf16:
4806 ; GFX10: ; %bb.0: ; %entry
4807 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4808 ; GFX10-NEXT: s_mov_b32 s18, s33
4809 ; GFX10-NEXT: s_mov_b32 s33, s32
4810 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
4811 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
4812 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4813 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
4814 ; GFX10-NEXT: s_addk_i32 s32, 0x200
4815 ; GFX10-NEXT: s_getpc_b64 s[16:17]
4816 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4817 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4818 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0
4819 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4820 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1
4821 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4822 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
4823 ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
4824 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4825 ; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
4826 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4827 ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4828 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4829 ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
4830 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4831 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1
4832 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0
4833 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
4834 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
4835 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4836 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
4837 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
4838 ; GFX10-NEXT: s_mov_b32 s33, s18
4839 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4840 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4842 ; GFX11-LABEL: test_call_v8bf16:
4843 ; GFX11: ; %bb.0: ; %entry
4844 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4845 ; GFX11-NEXT: s_mov_b32 s2, s33
4846 ; GFX11-NEXT: s_mov_b32 s33, s32
4847 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4848 ; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
4849 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4850 ; GFX11-NEXT: s_add_i32 s32, s32, 16
4851 ; GFX11-NEXT: s_getpc_b64 s[0:1]
4852 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
4853 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
4854 ; GFX11-NEXT: v_writelane_b32 v5, s30, 0
4855 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
4856 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1
4857 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4858 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
4859 ; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
4860 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4861 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1
4862 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0
4863 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
4864 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
4865 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
4866 ; GFX11-NEXT: s_add_i32 s32, s32, -16
4867 ; GFX11-NEXT: s_mov_b32 s33, s2
4868 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4869 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Call the callee with the incoming vector, then volatile-store its result to %out.
4871 %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
4872 store volatile <8 x bfloat> %result, ptr addrspace(5) %out
; test_call_v16bf16: passes the <16 x bfloat> argument %in to
; @test_arg_store_v2bf16 (called here with a <16 x bfloat> argument and a
; <16 x bfloat> return type) and writes the returned vector to the
; addrspace(5) pointer %out with a volatile store.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. As recorded in those assertions, after the call the
; result is written with sixteen 16-bit stores (buffer_store_short) under the
; GCN and GFX7 prefixes, eight 32-bit stores (buffer_store_dword) under the
; GFX8, GFX9 and GFX10 prefixes, and two 128-bit stores (scratch_store_b128,
; the upper half at offset 16 first) under the GFX11 prefix.
;
; NOTE(review): the callee is named for v2bf16 but is called with a
; <16 x bfloat> signature; its declaration is not visible in this part of the
; file - confirm the name reuse is intentional.
4876 define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
4877 ; GCN-LABEL: test_call_v16bf16:
4878 ; GCN: ; %bb.0: ; %entry
4879 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4880 ; GCN-NEXT: s_mov_b32 s18, s33
4881 ; GCN-NEXT: s_mov_b32 s33, s32
4882 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
4883 ; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill
4884 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
4885 ; GCN-NEXT: s_addk_i32 s32, 0x400
4886 ; GCN-NEXT: s_waitcnt expcnt(0)
4887 ; GCN-NEXT: v_writelane_b32 v21, s30, 0
4888 ; GCN-NEXT: v_writelane_b32 v21, s31, 1
4889 ; GCN-NEXT: s_getpc_b64 s[16:17]
4890 ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4891 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4892 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4893 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
4894 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
4895 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
4896 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4897 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
4898 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
4899 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
4900 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
4901 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
4902 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
4903 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
4904 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
4905 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
4906 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
4907 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
4908 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
4909 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
4910 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
4911 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
4912 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
4913 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
4914 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16
4915 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
4916 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
4917 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4918 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16
4919 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16
4920 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
4921 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
4922 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4923 ; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16
4924 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16
4925 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
4926 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
4927 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4928 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16
4929 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16
4930 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
4931 ; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen
4932 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4933 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16
4934 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16
4935 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
4936 ; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen
4937 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4938 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16
4939 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16
4940 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
4941 ; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen
4942 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4943 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16
4944 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4945 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4946 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4947 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4948 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4949 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4950 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
4951 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
4952 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
4953 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
4954 ; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen
4955 ; GCN-NEXT: s_waitcnt vmcnt(0)
4956 ; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen
4957 ; GCN-NEXT: s_waitcnt vmcnt(0)
4958 ; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen
4959 ; GCN-NEXT: s_waitcnt vmcnt(0)
4960 ; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen
4961 ; GCN-NEXT: s_waitcnt vmcnt(0)
4962 ; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen
4963 ; GCN-NEXT: s_waitcnt vmcnt(0)
4964 ; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen
4965 ; GCN-NEXT: s_waitcnt vmcnt(0)
4966 ; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen
4967 ; GCN-NEXT: s_waitcnt vmcnt(0)
4968 ; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen
4969 ; GCN-NEXT: s_waitcnt vmcnt(0)
4970 ; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen
4971 ; GCN-NEXT: s_waitcnt vmcnt(0)
4972 ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
4973 ; GCN-NEXT: s_waitcnt vmcnt(0)
4974 ; GCN-NEXT: v_readlane_b32 s31, v21, 1
4975 ; GCN-NEXT: v_readlane_b32 s30, v21, 0
4976 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
4977 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload
4978 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
4979 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
4980 ; GCN-NEXT: s_mov_b32 s33, s18
4981 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4982 ; GCN-NEXT: s_setpc_b64 s[30:31]
4984 ; GFX7-LABEL: test_call_v16bf16:
4985 ; GFX7: ; %bb.0: ; %entry
4986 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4987 ; GFX7-NEXT: s_mov_b32 s18, s33
4988 ; GFX7-NEXT: s_mov_b32 s33, s32
4989 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
4990 ; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
4991 ; GFX7-NEXT: s_mov_b64 exec, s[16:17]
4992 ; GFX7-NEXT: s_addk_i32 s32, 0x400
4993 ; GFX7-NEXT: s_getpc_b64 s[16:17]
4994 ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
4995 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
4996 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
4997 ; GFX7-NEXT: v_writelane_b32 v18, s30, 0
4998 ; GFX7-NEXT: v_writelane_b32 v18, s31, 1
4999 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5000 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
5001 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
5002 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
5003 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
5004 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16
5005 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
5006 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
5007 ; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
5008 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5009 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16
5010 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
5011 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
5012 ; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen
5013 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5014 ; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16
5015 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
5016 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
5017 ; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen
5018 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5019 ; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16
5020 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
5021 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
5022 ; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen
5023 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5024 ; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16
5025 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
5026 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
5027 ; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen
5028 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5029 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16
5030 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
5031 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
5032 ; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen
5033 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5034 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16
5035 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
5036 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
5037 ; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen
5038 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5039 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16
5040 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
5041 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
5042 ; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen
5043 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5044 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16
5045 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
5046 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
5047 ; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen
5048 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5049 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16
5050 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
5051 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
5052 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
5053 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5054 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16
5055 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
5056 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
5057 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
5058 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5059 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16
5060 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
5061 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
5062 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
5063 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5064 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16
5065 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
5066 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
5067 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
5068 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5069 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16
5070 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
5071 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5072 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
5073 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5074 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16
5075 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5076 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
5077 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5078 ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
5079 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5080 ; GFX7-NEXT: v_readlane_b32 s31, v18, 1
5081 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0
5082 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
5083 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
5084 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
5085 ; GFX7-NEXT: s_addk_i32 s32, 0xfc00
5086 ; GFX7-NEXT: s_mov_b32 s33, s18
5087 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5088 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5090 ; GFX8-LABEL: test_call_v16bf16:
5091 ; GFX8: ; %bb.0: ; %entry
5092 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5093 ; GFX8-NEXT: s_mov_b32 s18, s33
5094 ; GFX8-NEXT: s_mov_b32 s33, s32
5095 ; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
5096 ; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
5097 ; GFX8-NEXT: s_mov_b64 exec, s[16:17]
5098 ; GFX8-NEXT: s_addk_i32 s32, 0x400
5099 ; GFX8-NEXT: s_getpc_b64 s[16:17]
5100 ; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5101 ; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5102 ; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5103 ; GFX8-NEXT: v_writelane_b32 v10, s30, 0
5104 ; GFX8-NEXT: v_writelane_b32 v10, s31, 1
5105 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
5106 ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
5107 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8
5108 ; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
5109 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5110 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8
5111 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
5112 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5113 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8
5114 ; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
5115 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5116 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8
5117 ; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
5118 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5119 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8
5120 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
5121 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5122 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8
5123 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
5124 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5125 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8
5126 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
5127 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5128 ; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5129 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5130 ; GFX8-NEXT: v_readlane_b32 s31, v10, 1
5131 ; GFX8-NEXT: v_readlane_b32 s30, v10, 0
5132 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
5133 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
5134 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
5135 ; GFX8-NEXT: s_addk_i32 s32, 0xfc00
5136 ; GFX8-NEXT: s_mov_b32 s33, s18
5137 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5138 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5140 ; GFX9-LABEL: test_call_v16bf16:
5141 ; GFX9: ; %bb.0: ; %entry
5142 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5143 ; GFX9-NEXT: s_mov_b32 s18, s33
5144 ; GFX9-NEXT: s_mov_b32 s33, s32
5145 ; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
5146 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5147 ; GFX9-NEXT: s_mov_b64 exec, s[16:17]
5148 ; GFX9-NEXT: s_addk_i32 s32, 0x400
5149 ; GFX9-NEXT: s_getpc_b64 s[16:17]
5150 ; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5151 ; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5152 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5153 ; GFX9-NEXT: v_writelane_b32 v9, s30, 0
5154 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1
5155 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5156 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
5157 ; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5159 ; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5161 ; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5162 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5163 ; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5164 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5165 ; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5166 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5167 ; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5169 ; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5171 ; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5172 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5173 ; GFX9-NEXT: v_readlane_b32 s31, v9, 1
5174 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0
5175 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
5176 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5177 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
5178 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
5179 ; GFX9-NEXT: s_mov_b32 s33, s18
5180 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5181 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5183 ; GFX10-LABEL: test_call_v16bf16:
5184 ; GFX10: ; %bb.0: ; %entry
5185 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5186 ; GFX10-NEXT: s_mov_b32 s18, s33
5187 ; GFX10-NEXT: s_mov_b32 s33, s32
5188 ; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
5189 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
5190 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5191 ; GFX10-NEXT: s_mov_b32 exec_lo, s16
5192 ; GFX10-NEXT: s_addk_i32 s32, 0x200
5193 ; GFX10-NEXT: s_getpc_b64 s[16:17]
5194 ; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
5195 ; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
5196 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0
5197 ; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
5198 ; GFX10-NEXT: v_writelane_b32 v9, s31, 1
5199 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
5200 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
5201 ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
5202 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5203 ; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
5204 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5205 ; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
5206 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5207 ; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
5208 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5209 ; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
5210 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5211 ; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
5212 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5213 ; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
5214 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5215 ; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
5216 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5217 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1
5218 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0
5219 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
5220 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
5221 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5222 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
5223 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
5224 ; GFX10-NEXT: s_mov_b32 s33, s18
5225 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5226 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5228 ; GFX11-LABEL: test_call_v16bf16:
5229 ; GFX11: ; %bb.0: ; %entry
5230 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5231 ; GFX11-NEXT: s_mov_b32 s2, s33
5232 ; GFX11-NEXT: s_mov_b32 s33, s32
5233 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
5234 ; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
5235 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
5236 ; GFX11-NEXT: s_add_i32 s32, s32, 16
5237 ; GFX11-NEXT: s_getpc_b64 s[0:1]
5238 ; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
5239 ; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
5240 ; GFX11-NEXT: v_writelane_b32 v9, s30, 0
5241 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
5242 ; GFX11-NEXT: v_writelane_b32 v9, s31, 1
5243 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
5244 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
5245 ; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
5246 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5247 ; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc
5248 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5249 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1
5250 ; GFX11-NEXT: v_readlane_b32 s30, v9, 0
5251 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
5252 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
5253 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
5254 ; GFX11-NEXT: s_add_i32 s32, s32, -16
5255 ; GFX11-NEXT: s_mov_b32 s33, s2
5256 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5257 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Call the callee with the incoming vector, then volatile-store its result to %out.
5259 %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
5260 store volatile <16 x bfloat> %result, ptr addrspace(5) %out
; Round-trips a bfloat argument through a 2-byte-aligned addrspace(5) alloca:
; a volatile store of %in is followed by a volatile load of the same slot.
; The checks for the default-cpu and hawaii runs apply v_mul_f32 by 1.0 and a
; right shift by 16 before the 16-bit store, then shift the reloaded value left
; by 16. The checks for the tonga, gfx900, gfx1010 and gfx1100 runs store and
; reload v0 without any extra arithmetic.
5264 define bfloat @test_alloca_load_store_ret(bfloat %in) {
5265 ; GCN-LABEL: test_alloca_load_store_ret:
5266 ; GCN: ; %bb.0: ; %entry
5267 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5268 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
5269 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5270 ; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
5271 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5272 ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5273 ; GCN-NEXT: s_waitcnt vmcnt(0)
5274 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
5275 ; GCN-NEXT: s_setpc_b64 s[30:31]
5277 ; GFX7-LABEL: test_alloca_load_store_ret:
5278 ; GFX7: ; %bb.0: ; %entry
5279 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5280 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
5281 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5282 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
5283 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5284 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5285 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5286 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
5287 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5289 ; GFX8-LABEL: test_alloca_load_store_ret:
5290 ; GFX8: ; %bb.0: ; %entry
5291 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5292 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
5293 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5294 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5295 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5296 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5298 ; GFX9-LABEL: test_alloca_load_store_ret:
5299 ; GFX9: ; %bb.0: ; %entry
5300 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5301 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
5302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5303 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
5304 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5305 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5307 ; GFX10-LABEL: test_alloca_load_store_ret:
5308 ; GFX10: ; %bb.0: ; %entry
5309 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5310 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
5311 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5312 ; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
5313 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5314 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5316 ; GFX11-LABEL: test_alloca_load_store_ret:
5317 ; GFX11: ; %bb.0: ; %entry
5318 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5319 ; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc
5320 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5321 ; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc
5322 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5323 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Both accesses are volatile, so the store and the reload each appear in the
; expected output above instead of being folded away.
5325 %in.addr = alloca bfloat, align 2, addrspace(5)
5326 store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
5327 %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
; Returns an aggregate made of a <32 x i32> and a bfloat, built from the two
; arguments (%b becomes field 0, %a becomes field 1).
; For every check prefix the result is written to memory through the address
; held in v0: the vector covers byte offsets 0..127 and the bfloat is a 16-bit
; store at offset 128. The dwords stored at offsets 116, 120 and 124 are first
; loaded from s32 + 0, 4 and 8 respectively; every other dword of the vector is
; stored straight from v2..v30.
; NOTE(review): presumably those three dwords are the elements of %b that do not
; fit in argument registers and are therefore passed on the stack - confirm
; against the AMDGPU calling convention.
5331 define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
5332 ; GCN-LABEL: test_overflow_stack:
5334 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5335 ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5336 ; GCN-NEXT: s_waitcnt expcnt(0)
5337 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5338 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
5339 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
5340 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32
5341 ; GCN-NEXT: s_waitcnt vmcnt(2)
5342 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5343 ; GCN-NEXT: s_waitcnt expcnt(0)
5344 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
5345 ; GCN-NEXT: s_waitcnt vmcnt(2)
5346 ; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen
5347 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
5348 ; GCN-NEXT: s_waitcnt vmcnt(2)
5349 ; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen
5350 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
5351 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
5352 ; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5353 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
5354 ; GCN-NEXT: s_waitcnt expcnt(0)
5355 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0
5356 ; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen
5357 ; GCN-NEXT: s_waitcnt expcnt(0)
5358 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0
5359 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0
5360 ; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5361 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
5362 ; GCN-NEXT: s_waitcnt expcnt(0)
5363 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
5364 ; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen
5365 ; GCN-NEXT: s_waitcnt expcnt(0)
5366 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
5367 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
5368 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
5369 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
5370 ; GCN-NEXT: s_waitcnt expcnt(0)
5371 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
5372 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
5373 ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
5374 ; GCN-NEXT: s_waitcnt expcnt(0)
5375 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
5376 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
5377 ; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5378 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0
5379 ; GCN-NEXT: s_waitcnt expcnt(0)
5380 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0
5381 ; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen
5382 ; GCN-NEXT: s_waitcnt expcnt(0)
5383 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0
5384 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0
5385 ; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen
5386 ; GCN-NEXT: s_waitcnt expcnt(0)
5387 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0
5388 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0
5389 ; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen
5390 ; GCN-NEXT: s_waitcnt expcnt(0)
5391 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0
5392 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
5393 ; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen
5394 ; GCN-NEXT: s_waitcnt expcnt(0)
5395 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0
5396 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0
5397 ; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen
5398 ; GCN-NEXT: s_waitcnt expcnt(0)
5399 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0
5400 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0
5401 ; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen
5402 ; GCN-NEXT: s_waitcnt expcnt(0)
5403 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0
5404 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0
5405 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
5406 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5407 ; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen
5408 ; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5409 ; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
5410 ; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen
5411 ; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen
5412 ; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen
5413 ; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
5414 ; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
5415 ; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen
5416 ; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen
5417 ; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen
5418 ; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
5419 ; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen
5420 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
5421 ; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
5422 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5423 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5424 ; GCN-NEXT: s_setpc_b64 s[30:31]
5426 ; GFX7-LABEL: test_overflow_stack:
5428 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5429 ; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5430 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5431 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
5432 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
5433 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5434 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5435 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5436 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
5437 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
5438 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5439 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5440 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32
5441 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0
5442 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5443 ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5444 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
5445 ; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5446 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
5447 ; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
5448 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
5449 ; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5450 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
5451 ; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
5452 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
5453 ; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
5454 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
5455 ; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
5456 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
5457 ; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5458 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
5459 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
5460 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
5461 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
5462 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
5463 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
5464 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
5465 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
5466 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
5467 ; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
5468 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0
5469 ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
5470 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0
5471 ; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
5472 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
5473 ; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5474 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0
5475 ; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
5476 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0
5477 ; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
5478 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
5479 ; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
5480 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0
5481 ; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
5482 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0
5483 ; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
5484 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0
5485 ; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
5486 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0
5487 ; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
5488 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0
5489 ; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
5490 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0
5491 ; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
5492 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
5493 ; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
5494 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
5495 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
5496 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
5497 ; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5498 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
5499 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
5500 ; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
5501 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5502 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5503 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5505 ; GFX8-LABEL: test_overflow_stack:
5507 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5508 ; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5509 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
5510 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0
5511 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5512 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5513 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
5514 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0
5515 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5516 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5517 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32
5518 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0
5519 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5520 ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
5521 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
5522 ; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
5523 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
5524 ; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
5525 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
5526 ; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
5527 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
5528 ; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
5529 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
5530 ; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
5531 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
5532 ; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
5533 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
5534 ; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
5535 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
5536 ; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
5537 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
5538 ; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
5539 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
5540 ; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
5541 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
5542 ; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
5543 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
5544 ; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
5545 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0
5546 ; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
5547 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0
5548 ; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
5549 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0
5550 ; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
5551 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0
5552 ; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
5553 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0
5554 ; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
5555 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0
5556 ; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
5557 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0
5558 ; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
5559 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0
5560 ; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
5561 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
5562 ; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
5563 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0
5564 ; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
5565 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0
5566 ; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
5567 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0
5568 ; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
5569 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
5570 ; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
5571 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0
5572 ; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
5573 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
5574 ; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5575 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
5576 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
5577 ; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
5578 ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
5579 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5580 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5582 ; GFX9-LABEL: test_overflow_stack:
5584 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5585 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5586 ; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5587 ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5588 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5589 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5590 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5591 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5592 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5593 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5594 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5595 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5596 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
5597 ; GFX9-NEXT: s_nop 0
5598 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5599 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
5600 ; GFX9-NEXT: s_nop 0
5601 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5602 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32
5603 ; GFX9-NEXT: s_nop 0
5604 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5605 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5606 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5607 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5608 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5609 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5610 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5611 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5612 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5613 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5614 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5615 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5616 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5617 ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5618 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5619 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5620 ; GFX9-NEXT: s_waitcnt vmcnt(18)
5621 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
5622 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
5623 ; GFX9-NEXT: s_waitcnt vmcnt(18)
5624 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
5625 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5626 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5627 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5629 ; GFX10-LABEL: test_overflow_stack:
5631 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5632 ; GFX10-NEXT: s_clause 0x2
5633 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
5634 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
5635 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32
5636 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
5637 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
5638 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
5639 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
5640 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
5641 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
5642 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
5643 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
5644 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
5645 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
5646 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
5647 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
5648 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
5649 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
5650 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
5651 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
5652 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
5653 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
5654 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
5655 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
5656 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
5657 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
5658 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
5659 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
5660 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
5661 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
5662 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
5663 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
5664 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5665 ; GFX10-NEXT: s_waitcnt vmcnt(2)
5666 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
5667 ; GFX10-NEXT: s_waitcnt vmcnt(1)
5668 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
5669 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5670 ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
5671 ; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
5672 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5674 ; GFX11-LABEL: test_overflow_stack:
5676 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5677 ; GFX11-NEXT: s_clause 0x2
5678 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
5679 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
5680 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
5681 ; GFX11-NEXT: s_clause 0x5
5682 ; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5683 ; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5684 ; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5685 ; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5686 ; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5687 ; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
5688 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5689 ; GFX11-NEXT: s_clause 0x2
5690 ; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5691 ; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5692 ; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
5693 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Assemble the returned aggregate: the vector goes into field 0 first, then the
; bfloat into field 1.
5694 %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
5695 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
5696 ret { <32 x i32>, bfloat } %ins.1
; Loads a <2 x bfloat> through an addrspace(1) pointer and extends it to
; <2 x float>.
; Every check prefix expects one 32-bit load into v1, after which v0 is formed
; by shifting that dword left by 16 and v1 by masking it with 0xffff0000; no
; conversion instruction is expected.
5699 define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
5700 ; GCN-LABEL: global_extload_v2bf16_to_v2f32:
5702 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5703 ; GCN-NEXT: s_mov_b32 s6, 0
5704 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5705 ; GCN-NEXT: s_mov_b32 s4, s6
5706 ; GCN-NEXT: s_mov_b32 s5, s6
5707 ; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5708 ; GCN-NEXT: s_waitcnt vmcnt(0)
5709 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5710 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5711 ; GCN-NEXT: s_setpc_b64 s[30:31]
5713 ; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
5715 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5716 ; GFX7-NEXT: s_mov_b32 s6, 0
5717 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5718 ; GFX7-NEXT: s_mov_b32 s4, s6
5719 ; GFX7-NEXT: s_mov_b32 s5, s6
5720 ; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
5721 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5722 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5723 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5724 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5726 ; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
5728 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5729 ; GFX8-NEXT: flat_load_dword v1, v[0:1]
5730 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5731 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5732 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5733 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5735 ; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
5737 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5738 ; GFX9-NEXT: global_load_dword v1, v[0:1], off
5739 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5740 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5741 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5742 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5744 ; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
5746 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5747 ; GFX10-NEXT: global_load_dword v1, v[0:1], off
5748 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5749 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5750 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5751 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5753 ; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
5755 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5756 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
5757 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5758 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5759 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5760 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Vector load followed by fpext; the checks above show the pair being selected
; as a plain dword load plus shift and mask.
5761 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
5762 %fpext = fpext <2 x bfloat> %load to <2 x float>
5763 ret <2 x float> %fpext
; Loads a <3 x bfloat> through an addrspace(1) pointer and extends it to
; <3 x float>.
; Every check prefix expects one 64-bit load into v[1:2]. v0 and v1 are then
; derived from the first dword (shift left by 16, mask with 0xffff0000) and v2
; from the second dword by a shift left by 16; the upper half of the second
; dword is not used.
5766 define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
5767 ; GCN-LABEL: global_extload_v3bf16_to_v3f32:
5769 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5770 ; GCN-NEXT: s_mov_b32 s6, 0
5771 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5772 ; GCN-NEXT: s_mov_b32 s4, s6
5773 ; GCN-NEXT: s_mov_b32 s5, s6
5774 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5775 ; GCN-NEXT: s_waitcnt vmcnt(0)
5776 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5777 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5778 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5779 ; GCN-NEXT: s_setpc_b64 s[30:31]
5781 ; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
5783 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5784 ; GFX7-NEXT: s_mov_b32 s6, 0
5785 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5786 ; GFX7-NEXT: s_mov_b32 s4, s6
5787 ; GFX7-NEXT: s_mov_b32 s5, s6
5788 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
5789 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5790 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5791 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5792 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5793 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5795 ; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
5797 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5798 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
5799 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5800 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5801 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5802 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5803 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5805 ; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
5807 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5808 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
5809 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5810 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5811 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5812 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5813 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5815 ; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
5817 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5818 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
5819 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5820 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5821 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5822 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5823 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5825 ; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
5827 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5828 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
5829 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5830 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
5831 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
5832 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
5833 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Three-element variant: the odd element count still yields a two-dword load in
; the checks above.
5834 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
5835 %fpext = fpext <3 x bfloat> %load to <3 x float>
5836 ret <3 x float> %fpext
; Loads a <4 x bfloat> through an addrspace(1) pointer and extends it to
; <4 x float>.
; Every check prefix expects one 64-bit load into v[2:3]. Each loaded dword
; then yields two results: a shift left by 16 (v0 from v2, v2 from v3) and a
; mask with 0xffff0000 (v1 from v2, v3 from v3).
5839 define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
5840 ; GCN-LABEL: global_extload_v4bf16_to_v4f32:
5842 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5843 ; GCN-NEXT: s_mov_b32 s6, 0
5844 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5845 ; GCN-NEXT: s_mov_b32 s4, s6
5846 ; GCN-NEXT: s_mov_b32 s5, s6
5847 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5848 ; GCN-NEXT: s_waitcnt vmcnt(0)
5849 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5850 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5851 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5852 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5853 ; GCN-NEXT: s_setpc_b64 s[30:31]
5855 ; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
5857 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5858 ; GFX7-NEXT: s_mov_b32 s6, 0
5859 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5860 ; GFX7-NEXT: s_mov_b32 s4, s6
5861 ; GFX7-NEXT: s_mov_b32 s5, s6
5862 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5863 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5864 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5865 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5866 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5867 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5868 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5870 ; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
5872 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5873 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5874 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5875 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5876 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5877 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5878 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5879 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5881 ; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
5883 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5884 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
5885 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5886 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5887 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5888 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5889 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5890 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5892 ; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
5894 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5895 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
5896 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5897 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5898 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5899 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5900 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5901 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5903 ; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
5905 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5906 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
5907 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5908 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5909 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5910 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5911 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5912 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Four-element variant: both halves of both loaded dwords are consumed in the
; checks above.
5913 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
5914 %fpext = fpext <4 x bfloat> %load to <4 x float>
5915 ret <4 x float> %fpext
; Extending load of an odd-sized vector: loads <5 x bfloat> from global memory
; (addrspace 1) and fpexts every element to float. In the expected code each
; result register is produced from a loaded dword by either a 16-bit left shift
; or a 0xffff0000 mask. The GCN and GFX7 checks expect the fifth element to be
; fetched by a separate buffer_load_ushort at offset 8, while the GFX8 and
; later checks expect one 128-bit load.
5918 define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
5919 ; GCN-LABEL: global_extload_v5bf16_to_v5f32:
5921 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5922 ; GCN-NEXT: s_mov_b32 s6, 0
5923 ; GCN-NEXT: s_mov_b32 s7, 0xf000
5924 ; GCN-NEXT: s_mov_b32 s4, s6
5925 ; GCN-NEXT: s_mov_b32 s5, s6
5926 ; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5927 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5928 ; GCN-NEXT: s_waitcnt vmcnt(1)
5929 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5930 ; GCN-NEXT: s_waitcnt vmcnt(0)
5931 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5932 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5933 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5934 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5935 ; GCN-NEXT: s_setpc_b64 s[30:31]
5937 ; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
5939 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5940 ; GFX7-NEXT: s_mov_b32 s6, 0
5941 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
5942 ; GFX7-NEXT: s_mov_b32 s4, s6
5943 ; GFX7-NEXT: s_mov_b32 s5, s6
5944 ; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
5945 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
5946 ; GFX7-NEXT: s_waitcnt vmcnt(1)
5947 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5948 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5949 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5950 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5951 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5952 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5953 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5955 ; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
5957 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5958 ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
5959 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5960 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5961 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5962 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5963 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5964 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5965 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5967 ; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
5969 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5970 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
5971 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5972 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5973 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5974 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5975 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5976 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5977 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5979 ; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
5981 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5982 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
5983 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5984 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5985 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5986 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5987 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
5988 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5989 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5991 ; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
5993 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5994 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
5995 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5996 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
5997 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
5998 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
5999 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
6000 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
6001 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6002 %load = load <5 x bfloat>, ptr addrspace(1) %ptr
6003 %fpext = fpext <5 x bfloat> %load to <5 x float>
6004 ret <5 x float> %fpext
; Extending load of a 96-bit vector: loads <6 x bfloat> from global memory
; (addrspace 1) and fpexts every element to float. In the expected code each
; result register is produced from a loaded dword by either a 16-bit left shift
; or a 0xffff0000 mask. The GCN checks expect a buffer_load_dwordx4, whereas
; the GFX7 and later checks expect a three-dword load (dwordx3 / b96).
6007 define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
6008 ; GCN-LABEL: global_extload_v6bf16_to_v6f32:
6010 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6011 ; GCN-NEXT: s_mov_b32 s6, 0
6012 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6013 ; GCN-NEXT: s_mov_b32 s4, s6
6014 ; GCN-NEXT: s_mov_b32 s5, s6
6015 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
6016 ; GCN-NEXT: s_waitcnt vmcnt(0)
6017 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6018 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6019 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6020 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6021 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6022 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6023 ; GCN-NEXT: s_setpc_b64 s[30:31]
6025 ; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
6027 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6028 ; GFX7-NEXT: s_mov_b32 s6, 0
6029 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6030 ; GFX7-NEXT: s_mov_b32 s4, s6
6031 ; GFX7-NEXT: s_mov_b32 s5, s6
6032 ; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
6033 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6034 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6035 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6036 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6037 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6038 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6039 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6040 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6042 ; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
6044 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6045 ; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1]
6046 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6047 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6048 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6049 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6050 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6051 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6052 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6053 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6055 ; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
6057 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6058 ; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
6059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6060 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6061 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6062 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6063 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6064 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6065 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6066 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6068 ; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
6070 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6071 ; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
6072 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6073 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6074 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6075 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6076 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6077 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6078 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6079 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6081 ; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
6083 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6084 ; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off
6085 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6086 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3
6087 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6088 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4
6089 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
6090 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
6091 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
6092 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6093 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
6094 %fpext = fpext <6 x bfloat> %load to <6 x float>
6095 ret <6 x float> %fpext
; Extending load of a 128-bit vector: loads <8 x bfloat> from global memory
; (addrspace 1) and fpexts every element to float. Every check prefix expects a
; single four-dword load followed by eight widening operations, each either a
; 16-bit left shift or a 0xffff0000 mask of one loaded dword.
6098 define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
6099 ; GCN-LABEL: global_extload_v8bf16_to_v8f32:
6101 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6102 ; GCN-NEXT: s_mov_b32 s6, 0
6103 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6104 ; GCN-NEXT: s_mov_b32 s4, s6
6105 ; GCN-NEXT: s_mov_b32 s5, s6
6106 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6107 ; GCN-NEXT: s_waitcnt vmcnt(0)
6108 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6109 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6110 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6111 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6112 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6113 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6114 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6115 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6116 ; GCN-NEXT: s_setpc_b64 s[30:31]
6118 ; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
6120 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6121 ; GFX7-NEXT: s_mov_b32 s6, 0
6122 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6123 ; GFX7-NEXT: s_mov_b32 s4, s6
6124 ; GFX7-NEXT: s_mov_b32 s5, s6
6125 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6126 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6127 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6128 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6129 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6130 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6131 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6132 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6133 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6134 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6135 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6137 ; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
6139 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6140 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6141 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6142 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6143 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6144 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6145 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6146 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6147 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6148 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6149 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6150 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6152 ; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
6154 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6155 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6156 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6157 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6158 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6159 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6160 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6161 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6162 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6163 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6164 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6165 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6167 ; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
6169 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6170 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6171 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6172 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6173 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6174 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6175 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6176 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6177 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6178 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6179 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6180 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6182 ; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
6184 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6185 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6186 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6187 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6188 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6189 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6190 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6191 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6192 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6193 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6194 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6195 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6196 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
6197 %fpext = fpext <8 x bfloat> %load to <8 x float>
6198 ret <8 x float> %fpext
; Extending load of a 256-bit vector: loads <16 x bfloat> from global memory
; (addrspace 1) and fpexts every element to float. Every check prefix expects
; two four-dword loads, with s_waitcnt vmcnt(1) before the first group of
; shift/mask operations and s_waitcnt vmcnt(0) before the second. The GFX8
; checks expect the second address to be formed with an explicit 64-bit add of
; 16, while the other prefixes expect an offset:16 immediate; the GFX10 and
; GFX11 checks additionally expect an s_clause 0x1 ahead of the two loads.
6201 define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
6202 ; GCN-LABEL: global_extload_v16bf16_to_v16f32:
6204 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6205 ; GCN-NEXT: s_mov_b32 s6, 0
6206 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6207 ; GCN-NEXT: s_mov_b32 s4, s6
6208 ; GCN-NEXT: s_mov_b32 s5, s6
6209 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6210 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6211 ; GCN-NEXT: s_waitcnt vmcnt(1)
6212 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6213 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6214 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6215 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6216 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6217 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6218 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6219 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6220 ; GCN-NEXT: s_waitcnt vmcnt(0)
6221 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6222 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6223 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6224 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6225 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6226 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6227 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6228 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6229 ; GCN-NEXT: s_setpc_b64 s[30:31]
6231 ; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
6233 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6234 ; GFX7-NEXT: s_mov_b32 s6, 0
6235 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6236 ; GFX7-NEXT: s_mov_b32 s4, s6
6237 ; GFX7-NEXT: s_mov_b32 s5, s6
6238 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6239 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6240 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6241 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6242 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6243 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6244 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6245 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6246 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6247 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6248 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6249 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6250 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6251 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6252 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6253 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6254 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6255 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6256 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6257 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6258 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6260 ; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
6262 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6263 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6264 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6265 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6266 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
6267 ; GFX8-NEXT: s_waitcnt vmcnt(1)
6268 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6269 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6270 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6271 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6272 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6273 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6274 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6275 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6276 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6277 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6278 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6279 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6280 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6281 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6282 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6283 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6284 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6285 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6287 ; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
6289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6290 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6291 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6292 ; GFX9-NEXT: s_waitcnt vmcnt(1)
6293 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6294 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6295 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6296 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6297 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6298 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6299 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6300 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6301 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6302 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6303 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6304 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6305 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6306 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6307 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6308 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6309 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6310 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6312 ; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
6314 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6315 ; GFX10-NEXT: s_clause 0x1
6316 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6317 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6318 ; GFX10-NEXT: s_waitcnt vmcnt(1)
6319 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6320 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6321 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6322 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6323 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6324 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6325 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6326 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6327 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6328 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6329 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6330 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6331 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6332 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6333 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6334 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6335 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6336 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6338 ; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
6340 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6341 ; GFX11-NEXT: s_clause 0x1
6342 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6343 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
6344 ; GFX11-NEXT: s_waitcnt vmcnt(1)
6345 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6346 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6347 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6348 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6349 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6350 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6351 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6352 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6353 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6354 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6355 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6356 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6357 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6358 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6359 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6360 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6361 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6362 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6363 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
6364 %fpext = fpext <16 x bfloat> %load to <16 x float>
6365 ret <16 x float> %fpext
; Extending load of a 512-bit vector: loads <32 x bfloat> from global memory
; (addrspace 1) and fpexts every element to float. Every check prefix expects
; four four-dword loads, each followed by its own group of eight shift/mask
; operations gated by a decreasing s_waitcnt vmcnt(3), (2), (1), (0). The GFX8
; checks expect the addresses at +16, +32 and +48 to be formed with explicit
; 64-bit adds, while the other prefixes expect offset immediates; the GFX10 and
; GFX11 checks additionally expect an s_clause 0x3 ahead of the four loads.
6368 define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
6369 ; GCN-LABEL: global_extload_v32bf16_to_v32f32:
6371 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6372 ; GCN-NEXT: s_mov_b32 s6, 0
6373 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6374 ; GCN-NEXT: s_mov_b32 s4, s6
6375 ; GCN-NEXT: s_mov_b32 s5, s6
6376 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6377 ; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6378 ; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6379 ; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6380 ; GCN-NEXT: s_waitcnt vmcnt(3)
6381 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6382 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6383 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6384 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6385 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6386 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6387 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6388 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6389 ; GCN-NEXT: s_waitcnt vmcnt(2)
6390 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6391 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6392 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6393 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6394 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6395 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6396 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6397 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6398 ; GCN-NEXT: s_waitcnt vmcnt(1)
6399 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6400 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6401 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6402 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6403 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6404 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6405 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6406 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6407 ; GCN-NEXT: s_waitcnt vmcnt(0)
6408 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6409 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6410 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6411 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6412 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6413 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6414 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6415 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6416 ; GCN-NEXT: s_setpc_b64 s[30:31]
6418 ; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
6420 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6421 ; GFX7-NEXT: s_mov_b32 s6, 0
6422 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6423 ; GFX7-NEXT: s_mov_b32 s4, s6
6424 ; GFX7-NEXT: s_mov_b32 s5, s6
6425 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
6426 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
6427 ; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
6428 ; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
6429 ; GFX7-NEXT: s_waitcnt vmcnt(3)
6430 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6431 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6432 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6433 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6434 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6435 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6436 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6437 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6438 ; GFX7-NEXT: s_waitcnt vmcnt(2)
6439 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6440 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6441 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6442 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6443 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6444 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6445 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6446 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6447 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6448 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6449 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6450 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6451 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6452 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6453 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6454 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6455 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6456 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6457 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6458 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6459 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6460 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6461 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6462 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6463 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6464 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6465 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6467 ; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
6469 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6470 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
6471 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
6472 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
6473 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3]
6474 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
6475 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
6476 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
6477 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6478 ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3]
6479 ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
6480 ; GFX8-NEXT: s_waitcnt vmcnt(3)
6481 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6482 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6483 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6484 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6485 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6486 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6487 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6488 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6489 ; GFX8-NEXT: s_waitcnt vmcnt(2)
6490 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6491 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6492 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6493 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6494 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6495 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6496 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6497 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6498 ; GFX8-NEXT: s_waitcnt vmcnt(1)
6499 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6500 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6501 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6502 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6503 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6504 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6505 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6506 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6507 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6508 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6509 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6510 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6511 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6512 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6513 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6514 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6515 ; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6516 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6518 ; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
6520 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6521 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6522 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6523 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
6524 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
6525 ; GFX9-NEXT: s_waitcnt vmcnt(3)
6526 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6527 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6528 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6529 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6530 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6531 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6532 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6533 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6534 ; GFX9-NEXT: s_waitcnt vmcnt(2)
6535 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6536 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6537 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6538 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6539 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6540 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6541 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6542 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6543 ; GFX9-NEXT: s_waitcnt vmcnt(1)
6544 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6545 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6546 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6547 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6548 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6549 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6550 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6551 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6552 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6553 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6554 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6555 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6556 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6557 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6558 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6559 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6560 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6561 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6563 ; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
6565 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6566 ; GFX10-NEXT: s_clause 0x3
6567 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
6568 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
6569 ; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
6570 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
6571 ; GFX10-NEXT: s_waitcnt vmcnt(3)
6572 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6573 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6574 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6575 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6576 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6577 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6578 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6579 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6580 ; GFX10-NEXT: s_waitcnt vmcnt(2)
6581 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6582 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6583 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6584 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6585 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6586 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6587 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6588 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6589 ; GFX10-NEXT: s_waitcnt vmcnt(1)
6590 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6591 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6592 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6593 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6594 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6595 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6596 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6597 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6598 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6599 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6600 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6601 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6602 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6603 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6604 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6605 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6606 ; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6607 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6609 ; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
6611 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6612 ; GFX11-NEXT: s_clause 0x3
6613 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
6614 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
6615 ; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32
6616 ; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48
6617 ; GFX11-NEXT: s_waitcnt vmcnt(3)
6618 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6619 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6620 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
6621 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
6622 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6623 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6624 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
6625 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
6626 ; GFX11-NEXT: s_waitcnt vmcnt(2)
6627 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
6628 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
6629 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
6630 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
6631 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
6632 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
6633 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
6634 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
6635 ; GFX11-NEXT: s_waitcnt vmcnt(1)
6636 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20
6637 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
6638 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21
6639 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
6640 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22
6641 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
6642 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23
6643 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
6644 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6645 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28
6646 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
6647 ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29
6648 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
6649 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30
6650 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
6651 ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31
6652 ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
6653 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6654 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
6655 %fpext = fpext <32 x bfloat> %load to <32 x float>
6656 ret <32 x float> %fpext
; Loads a <2 x bfloat> vector from an addrspace(1) pointer and widens it to
; <2 x double> with fpext. For every checked target the assertions below
; expect one 32-bit load, a shift-left-by-16 for the low element, a
; 0xffff0000 mask for the high element, and one v_cvt_f64_f32 per element.
6659 define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
6660 ; GCN-LABEL: global_extload_v2bf16_to_v2f64:
6662 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6663 ; GCN-NEXT: s_mov_b32 s6, 0
6664 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6665 ; GCN-NEXT: s_mov_b32 s4, s6
6666 ; GCN-NEXT: s_mov_b32 s5, s6
6667 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
6668 ; GCN-NEXT: s_waitcnt vmcnt(0)
6669 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6670 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6671 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6672 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6673 ; GCN-NEXT: s_setpc_b64 s[30:31]
6675 ; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
6677 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6678 ; GFX7-NEXT: s_mov_b32 s6, 0
6679 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6680 ; GFX7-NEXT: s_mov_b32 s4, s6
6681 ; GFX7-NEXT: s_mov_b32 s5, s6
6682 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
6683 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6684 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6685 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6686 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6687 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6688 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6690 ; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
6692 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6693 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
6694 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6695 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6696 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6697 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6698 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6699 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6701 ; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
6703 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6704 ; GFX9-NEXT: global_load_dword v2, v[0:1], off
6705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6706 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
6707 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
6708 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6709 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6710 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6712 ; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
6714 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6715 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
6716 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6717 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6718 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6719 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6720 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6721 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6723 ; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
6725 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6726 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
6727 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6728 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
6729 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
6730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6731 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
6732 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
6733 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6734 %load = load <2 x bfloat>, ptr addrspace(1) %ptr
6735 %fpext = fpext <2 x bfloat> %load to <2 x double>
6736 ret <2 x double> %fpext
; Loads a <3 x bfloat> vector from an addrspace(1) pointer and widens it to
; <3 x double> with fpext. Every checked target is expected to use a single
; 64-bit load; only the low half of the second dword is extended (a shift,
; no mask), so exactly three v_cvt_f64_f32 conversions are expected.
6739 define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
6740 ; GCN-LABEL: global_extload_v3bf16_to_v3f64:
6742 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6743 ; GCN-NEXT: s_mov_b32 s6, 0
6744 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6745 ; GCN-NEXT: s_mov_b32 s4, s6
6746 ; GCN-NEXT: s_mov_b32 s5, s6
6747 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6748 ; GCN-NEXT: s_waitcnt vmcnt(0)
6749 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6750 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6751 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6752 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6753 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6754 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6755 ; GCN-NEXT: s_setpc_b64 s[30:31]
6757 ; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
6759 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6760 ; GFX7-NEXT: s_mov_b32 s6, 0
6761 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6762 ; GFX7-NEXT: s_mov_b32 s4, s6
6763 ; GFX7-NEXT: s_mov_b32 s5, s6
6764 ; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
6765 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6766 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6767 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6768 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6769 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6770 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6771 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6772 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6774 ; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
6776 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6777 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
6778 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6779 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6780 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6781 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6782 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6783 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6784 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6785 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6787 ; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
6789 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6790 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
6791 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6792 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6793 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6794 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6795 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6796 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6797 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6798 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6800 ; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
6802 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6803 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
6804 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6805 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6806 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6807 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6808 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6809 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6810 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6811 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6813 ; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
6815 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6816 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
6817 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6818 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6819 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6820 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6821 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6822 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6823 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6824 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
6825 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6826 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6827 %load = load <3 x bfloat>, ptr addrspace(1) %ptr
6828 %fpext = fpext <3 x bfloat> %load to <3 x double>
6829 ret <3 x double> %fpext
; Loads a <4 x bfloat> vector from an addrspace(1) pointer and widens it to
; <4 x double> with fpext. Every checked target is expected to use a single
; 64-bit load, extend both halves of each dword (shift-left-by-16 for the low
; half, 0xffff0000 mask for the high half), and emit four v_cvt_f64_f32.
6832 define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
6833 ; GCN-LABEL: global_extload_v4bf16_to_v4f64:
6835 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6836 ; GCN-NEXT: s_mov_b32 s6, 0
6837 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6838 ; GCN-NEXT: s_mov_b32 s4, s6
6839 ; GCN-NEXT: s_mov_b32 s5, s6
6840 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6841 ; GCN-NEXT: s_waitcnt vmcnt(0)
6842 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6843 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6844 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6845 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6846 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6847 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6848 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6849 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6850 ; GCN-NEXT: s_setpc_b64 s[30:31]
6852 ; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
6854 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6855 ; GFX7-NEXT: s_mov_b32 s6, 0
6856 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6857 ; GFX7-NEXT: s_mov_b32 s4, s6
6858 ; GFX7-NEXT: s_mov_b32 s5, s6
6859 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6860 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6861 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6862 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6863 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6864 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6865 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6866 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6867 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6868 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6869 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6871 ; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
6873 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6874 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6875 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6876 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6877 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6878 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6879 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6880 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6881 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6882 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6883 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6884 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6886 ; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
6888 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6889 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
6890 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6891 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6892 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6893 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
6894 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6895 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
6896 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6897 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6898 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6899 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6901 ; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
6903 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6904 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
6905 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6906 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6907 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6908 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6909 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
6910 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6911 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6912 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6913 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6914 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6916 ; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
6918 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6919 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
6920 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6921 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6922 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6923 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
6924 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
6925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6926 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
6927 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
6928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6929 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
6930 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6931 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6932 %load = load <4 x bfloat>, ptr addrspace(1) %ptr
6933 %fpext = fpext <4 x bfloat> %load to <4 x double>
6934 ret <4 x double> %fpext
; Loads a <5 x bfloat> vector from an addrspace(1) pointer and widens it to
; <5 x double> with fpext. The expected load sequence differs by target: the
; two buffer-load configurations expect a 16-bit load at offset 8 plus a
; 64-bit load (with separate vmcnt waits), while the flat/global-load
; configurations expect one 128-bit load. All expect five v_cvt_f64_f32.
6937 define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
6938 ; GCN-LABEL: global_extload_v5bf16_to_v5f64:
6940 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6941 ; GCN-NEXT: s_mov_b32 s6, 0
6942 ; GCN-NEXT: s_mov_b32 s7, 0xf000
6943 ; GCN-NEXT: s_mov_b32 s4, s6
6944 ; GCN-NEXT: s_mov_b32 s5, s6
6945 ; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6946 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6947 ; GCN-NEXT: s_waitcnt vmcnt(1)
6948 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
6949 ; GCN-NEXT: s_waitcnt vmcnt(0)
6950 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6951 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6952 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6953 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6954 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
6955 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6956 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6957 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6958 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6959 ; GCN-NEXT: s_setpc_b64 s[30:31]
6961 ; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
6963 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6964 ; GFX7-NEXT: s_mov_b32 s6, 0
6965 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
6966 ; GFX7-NEXT: s_mov_b32 s4, s6
6967 ; GFX7-NEXT: s_mov_b32 s5, s6
6968 ; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
6969 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
6970 ; GFX7-NEXT: s_waitcnt vmcnt(1)
6971 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
6972 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6973 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6974 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6975 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6976 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6977 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
6978 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6979 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6980 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6981 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6982 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6984 ; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
6986 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6987 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
6988 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6989 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
6990 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
6991 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
6992 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
6993 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
6994 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
6995 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
6996 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
6997 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
6998 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
6999 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7001 ; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
7003 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7004 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
7005 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7006 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7007 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7008 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7009 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7010 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7011 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7012 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7013 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7014 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7015 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7016 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7018 ; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
7020 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7021 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7022 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7023 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7024 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7025 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
7026 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
7027 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7028 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7029 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7030 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7031 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7032 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7033 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7035 ; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
7037 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7038 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
7039 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7040 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7041 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7042 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3
7043 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
7044 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7045 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7046 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7047 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7048 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7049 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7050 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7051 %load = load <5 x bfloat>, ptr addrspace(1) %ptr
7052 %fpext = fpext <5 x bfloat> %load to <5 x double>
7053 ret <5 x double> %fpext
; Loads a <6 x bfloat> vector from an addrspace(1) pointer and widens it to
; <6 x double> with fpext. The expected load width differs by target: the
; run without an explicit -mcpu expects a 128-bit buffer load, while every
; other checked target expects a 96-bit load. All expect both halves of three
; dwords to be extended and six v_cvt_f64_f32 conversions.
7056 define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
7057 ; GCN-LABEL: global_extload_v6bf16_to_v6f64:
7059 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7060 ; GCN-NEXT: s_mov_b32 s6, 0
7061 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7062 ; GCN-NEXT: s_mov_b32 s4, s6
7063 ; GCN-NEXT: s_mov_b32 s5, s6
7064 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7065 ; GCN-NEXT: s_waitcnt vmcnt(0)
7066 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7067 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7068 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7069 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7070 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7071 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7072 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7073 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7074 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7075 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7076 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7077 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7078 ; GCN-NEXT: s_setpc_b64 s[30:31]
7080 ; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
7082 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7083 ; GFX7-NEXT: s_mov_b32 s6, 0
7084 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7085 ; GFX7-NEXT: s_mov_b32 s4, s6
7086 ; GFX7-NEXT: s_mov_b32 s5, s6
7087 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
7088 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7089 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7090 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7091 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7092 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7093 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7094 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7095 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7096 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7097 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7098 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7099 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7100 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7101 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7103 ; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
7105 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7106 ; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
7107 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7108 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7109 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7110 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7111 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7112 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7113 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7114 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7115 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7116 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7117 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7118 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7119 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7120 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7122 ; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
7124 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7125 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
7126 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7127 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
7128 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
7129 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
7130 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
7131 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7132 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7133 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
7134 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
7135 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
7136 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7137 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7138 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7139 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7141 ; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
7143 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7144 ; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
7145 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7146 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
7147 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
7148 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
7149 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
7150 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
7151 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
7152 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7153 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7154 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7155 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7156 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7157 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7158 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7160 ; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
7162 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7163 ; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off
7164 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7165 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
7166 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
7167 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
7168 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
7169 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
7170 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
7171 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7172 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7173 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7174 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7175 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7176 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7177 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7178 %load = load <6 x bfloat>, ptr addrspace(1) %ptr
7179 %fpext = fpext <6 x bfloat> %load to <6 x double>
7180 ret <6 x double> %fpext
; Loads an <8 x bfloat> vector from an addrspace(1) pointer and widens it to
; <8 x double> with fpext. Every checked target is expected to use a single
; 128-bit load, extend both halves of all four dwords (shift-left-by-16 for
; the low half, 0xffff0000 mask for the high half), and emit eight
; v_cvt_f64_f32 conversions.
7183 define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
7184 ; GCN-LABEL: global_extload_v8bf16_to_v8f64:
7186 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7187 ; GCN-NEXT: s_mov_b32 s6, 0
7188 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7189 ; GCN-NEXT: s_mov_b32 s4, s6
7190 ; GCN-NEXT: s_mov_b32 s5, s6
7191 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7192 ; GCN-NEXT: s_waitcnt vmcnt(0)
7193 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7194 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7195 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7196 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7197 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7198 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7199 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7200 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7201 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7202 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7203 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7204 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7205 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7206 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7207 ; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7208 ; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7209 ; GCN-NEXT: s_setpc_b64 s[30:31]
7211 ; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
7213 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7214 ; GFX7-NEXT: s_mov_b32 s6, 0
7215 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7216 ; GFX7-NEXT: s_mov_b32 s4, s6
7217 ; GFX7-NEXT: s_mov_b32 s5, s6
7218 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
7219 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7220 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7221 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7222 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7223 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7224 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7225 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7226 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7227 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7228 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7229 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7230 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7231 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7232 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7233 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7234 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7235 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7236 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7238 ; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
7240 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7241 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
7242 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7243 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7244 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7245 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7246 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7247 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7248 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7249 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7250 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7251 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7252 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7253 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7254 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7255 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7256 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7257 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7258 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7259 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7261 ; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
7263 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7264 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
7265 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7266 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
7267 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
7268 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1
7269 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
7270 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
7271 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
7272 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7273 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
7274 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
7275 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
7276 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7277 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7278 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7279 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
7280 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7281 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7282 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7284 ; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
7286 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7287 ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
7288 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7289 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7290 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7291 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7292 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7293 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7294 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7295 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7296 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7297 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7298 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7299 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7300 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7301 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7302 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7303 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7304 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7305 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7307 ; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
7309 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7310 ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
7311 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7312 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7313 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7314 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7315 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7316 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7317 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7318 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7319 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7320 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7321 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7322 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7323 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7324 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7325 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7326 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7327 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7328 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7329 %load = load <8 x bfloat>, ptr addrspace(1) %ptr
7330 %fpext = fpext <8 x bfloat> %load to <8 x double>
7331 ret <8 x double> %fpext
7334 define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
7335 ; GCN-LABEL: global_extload_v16bf16_to_v16f64:
7337 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7338 ; GCN-NEXT: s_mov_b32 s6, 0
7339 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7340 ; GCN-NEXT: s_mov_b32 s4, s6
7341 ; GCN-NEXT: s_mov_b32 s5, s6
7342 ; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7343 ; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7344 ; GCN-NEXT: s_waitcnt vmcnt(1)
7345 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7346 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7347 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7348 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7349 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7350 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7351 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7352 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7353 ; GCN-NEXT: s_waitcnt vmcnt(0)
7354 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7355 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7356 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7357 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7358 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7359 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7360 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7361 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7362 ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7363 ; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7364 ; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7365 ; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7366 ; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7367 ; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7368 ; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7369 ; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7370 ; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7371 ; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7372 ; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7373 ; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7374 ; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7375 ; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7376 ; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7377 ; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7378 ; GCN-NEXT: s_setpc_b64 s[30:31]
7380 ; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
7382 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7383 ; GFX7-NEXT: s_mov_b32 s6, 0
7384 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7385 ; GFX7-NEXT: s_mov_b32 s4, s6
7386 ; GFX7-NEXT: s_mov_b32 s5, s6
7387 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
7388 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
7389 ; GFX7-NEXT: s_waitcnt vmcnt(1)
7390 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7391 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7392 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7393 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7394 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7395 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7396 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7397 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7398 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7399 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7400 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7401 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7402 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7403 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7404 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7405 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7406 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7407 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7408 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7409 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7410 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7411 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7412 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7413 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7414 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7415 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7416 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7417 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7418 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7419 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7420 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7421 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7422 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7423 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7425 ; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
7427 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7428 ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
7429 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7430 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7431 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
7432 ; GFX8-NEXT: s_waitcnt vmcnt(1)
7433 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7434 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7435 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7436 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7437 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7438 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7439 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7440 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7441 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7442 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7443 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7444 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7445 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7446 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7447 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7448 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7449 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7450 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7451 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7452 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7453 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7454 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7455 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7456 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7457 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7458 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7459 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7460 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7461 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7462 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7463 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7464 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7465 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7466 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7468 ; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
7470 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7471 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7472 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
7473 ; GFX9-NEXT: s_waitcnt vmcnt(1)
7474 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7475 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7476 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3
7477 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
7478 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4
7479 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7480 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7481 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7482 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7483 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6
7484 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
7485 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7
7486 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
7487 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8
7488 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
7489 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9
7490 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
7491 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7492 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7493 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
7494 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
7495 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
7496 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7497 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7498 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7499 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7500 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7501 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7502 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7503 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7504 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7505 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7506 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7507 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7509 ; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
7511 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7512 ; GFX10-NEXT: s_clause 0x1
7513 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
7514 ; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16
7515 ; GFX10-NEXT: s_waitcnt vmcnt(1)
7516 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
7517 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
7518 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
7519 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
7520 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
7521 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
7522 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7523 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
7524 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7525 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9
7526 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
7527 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10
7528 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10
7529 ; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11
7530 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11
7531 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
7532 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12
7533 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7534 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7535 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
7536 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
7537 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7538 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
7539 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
7540 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
7541 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7542 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7543 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7544 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7545 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7546 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
7547 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7548 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7549 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7551 ; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
7553 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7554 ; GFX11-NEXT: s_clause 0x1
7555 ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
7556 ; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16
7557 ; GFX11-NEXT: s_waitcnt vmcnt(1)
7558 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
7559 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
7560 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
7561 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
7562 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
7563 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
7564 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
7565 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
7566 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7567 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23
7568 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
7569 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24
7570 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
7571 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
7572 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
7573 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26
7574 ; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26
7575 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
7576 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
7577 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
7578 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
7579 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
7580 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
7581 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
7582 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
7583 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
7584 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
7585 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7586 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7587 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
7588 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27
7589 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
7590 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
7591 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7592 %load = load <16 x bfloat>, ptr addrspace(1) %ptr
7593 %fpext = fpext <16 x bfloat> %load to <16 x double>
7594 ret <16 x double> %fpext
7597 define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
7598 ; GCN-LABEL: global_extload_v32bf16_to_v32f64:
7600 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7601 ; GCN-NEXT: s_mov_b32 s6, 0
7602 ; GCN-NEXT: s_mov_b32 s7, 0xf000
7603 ; GCN-NEXT: s_mov_b32 s4, s6
7604 ; GCN-NEXT: s_mov_b32 s5, s6
7605 ; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
7606 ; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
7607 ; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
7608 ; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
7609 ; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
7610 ; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
7611 ; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
7612 ; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
7613 ; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
7614 ; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
7615 ; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
7616 ; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
7617 ; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
7618 ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
7619 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
7620 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
7621 ; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
7622 ; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
7623 ; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
7624 ; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
7625 ; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
7626 ; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
7627 ; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
7628 ; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
7629 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
7630 ; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
7631 ; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
7632 ; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
7633 ; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
7634 ; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
7635 ; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
7636 ; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
7637 ; GCN-NEXT: s_waitcnt vmcnt(8)
7638 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
7639 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0
7640 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7641 ; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
7642 ; GCN-NEXT: s_waitcnt expcnt(0)
7643 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
7644 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7645 ; GCN-NEXT: s_waitcnt expcnt(0)
7646 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
7647 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0
7648 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7649 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
7650 ; GCN-NEXT: s_waitcnt expcnt(0)
7651 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
7652 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7653 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0
7654 ; GCN-NEXT: s_waitcnt expcnt(0)
7655 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
7656 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7657 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
7658 ; GCN-NEXT: s_waitcnt expcnt(0)
7659 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
7660 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7661 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0
7662 ; GCN-NEXT: s_waitcnt expcnt(0)
7663 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
7664 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7665 ; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
7666 ; GCN-NEXT: s_waitcnt expcnt(0)
7667 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
7668 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
7669 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0
7670 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0
7671 ; GCN-NEXT: s_waitcnt expcnt(0)
7672 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
7673 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7674 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7675 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0
7676 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7677 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0
7678 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0
7679 ; GCN-NEXT: s_waitcnt expcnt(0)
7680 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
7681 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7682 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7683 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0
7684 ; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
7685 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0
7686 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0
7687 ; GCN-NEXT: s_waitcnt expcnt(0)
7688 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
7689 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7690 ; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
7691 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0
7692 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7693 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0
7694 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0
7695 ; GCN-NEXT: s_waitcnt expcnt(0)
7696 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
7697 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7698 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7699 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0
7700 ; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
7701 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0
7702 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0
7703 ; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
7704 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
7705 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7706 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7707 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0
7708 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7709 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0
7710 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0
7711 ; GCN-NEXT: s_waitcnt expcnt(0)
7712 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
7713 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7714 ; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
7715 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0
7716 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
7717 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0
7718 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0
7719 ; GCN-NEXT: s_waitcnt expcnt(0)
7720 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
7721 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7722 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7723 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0
7724 ; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
7725 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0
7726 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0
7727 ; GCN-NEXT: s_waitcnt expcnt(0)
7728 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
7729 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7730 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7731 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0
7732 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7733 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
7734 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
7735 ; GCN-NEXT: s_waitcnt expcnt(0)
7736 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
7737 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7738 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
7739 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
7740 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7741 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0
7742 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
7743 ; GCN-NEXT: s_waitcnt expcnt(0)
7744 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
7745 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7746 ; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
7747 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
7748 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
7749 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0
7750 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0
7751 ; GCN-NEXT: s_waitcnt expcnt(0)
7752 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
7753 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7754 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
7755 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
7756 ; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
7757 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0
7758 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
7759 ; GCN-NEXT: s_waitcnt expcnt(0)
7760 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
7761 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7762 ; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
7763 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
7764 ; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
7765 ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0
7766 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0
7767 ; GCN-NEXT: s_waitcnt expcnt(0)
7768 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
7769 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7770 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
7771 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
7772 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
7773 ; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0
7774 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0
7775 ; GCN-NEXT: s_waitcnt expcnt(0)
7776 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
7777 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7778 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
7779 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
7780 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7781 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
7782 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0
7783 ; GCN-NEXT: s_waitcnt expcnt(0)
7784 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
7785 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7786 ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
7787 ; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0
7788 ; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
7789 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
7790 ; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
7791 ; GCN-NEXT: s_waitcnt expcnt(0)
7792 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
7793 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7794 ; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
7795 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0
7796 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
7797 ; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0
7798 ; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
7799 ; GCN-NEXT: s_waitcnt expcnt(0)
7800 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
7801 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7802 ; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
7803 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
7804 ; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
7805 ; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0
7806 ; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
7807 ; GCN-NEXT: s_waitcnt expcnt(0)
7808 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
7809 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7810 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
7811 ; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0
7812 ; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
7813 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
7814 ; GCN-NEXT: s_waitcnt expcnt(0)
7815 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12
7816 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
7817 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
7818 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
7819 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
7820 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4
7821 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
7822 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6
7823 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
7824 ; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
7825 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
7826 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
7827 ; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
7828 ; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
7829 ; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
7830 ; GCN-NEXT: s_waitcnt expcnt(0)
7831 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
7832 ; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
7833 ; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
7834 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
7835 ; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36
7836 ; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
7837 ; GCN-NEXT: s_waitcnt expcnt(0)
7838 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
7839 ; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
7840 ; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
7841 ; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen
7842 ; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
7843 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
7844 ; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
7845 ; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen
7846 ; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen
7847 ; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
7848 ; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen
7849 ; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
7850 ; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
7851 ; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
7852 ; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen
7853 ; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen
7854 ; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen
7855 ; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
7856 ; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
7857 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7858 ; GCN-NEXT: s_setpc_b64 s[30:31]
7860 ; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
7862 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7863 ; GFX7-NEXT: s_mov_b32 s6, 0
7864 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
7865 ; GFX7-NEXT: s_mov_b32 s4, s6
7866 ; GFX7-NEXT: s_mov_b32 s5, s6
7867 ; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
7868 ; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
7869 ; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
7870 ; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
7871 ; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
7872 ; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
7873 ; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
7874 ; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
7875 ; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
7876 ; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
7877 ; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
7878 ; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
7879 ; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
7880 ; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
7881 ; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
7882 ; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
7883 ; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
7884 ; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
7885 ; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
7886 ; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
7887 ; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
7888 ; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
7889 ; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
7890 ; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
7891 ; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
7892 ; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
7893 ; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
7894 ; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
7895 ; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
7896 ; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
7897 ; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
7898 ; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
7899 ; GFX7-NEXT: s_waitcnt vmcnt(14)
7900 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20
7901 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7902 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0
7903 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7904 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
7905 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7906 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
7907 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7908 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0
7909 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0
7910 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7911 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
7912 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7913 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23
7914 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7915 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0
7916 ; GFX7-NEXT: s_waitcnt vmcnt(14)
7917 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
7918 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7919 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
7920 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7921 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24
7922 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7923 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0
7924 ; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0
7925 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7926 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
7927 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25
7928 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7929 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
7930 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0
7931 ; GFX7-NEXT: s_waitcnt vmcnt(8)
7932 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
7933 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7934 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26
7935 ; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
7936 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
7937 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
7938 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0
7939 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
7940 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7941 ; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen
7942 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28
7943 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7944 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0
7945 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
7946 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0
7947 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
7948 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0
7949 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7950 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34
7951 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
7952 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
7953 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7954 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0
7955 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33
7956 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
7957 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
7958 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0
7959 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32
7960 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7961 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7962 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0
7963 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
7964 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
7965 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
7966 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0
7967 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7968 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31
7969 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
7970 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0
7971 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7972 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0
7973 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30
7974 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
7975 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
7976 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
7977 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29
7978 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7979 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
7980 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0
7981 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
7982 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0
7983 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
7984 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0
7985 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
7986 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
7987 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
7988 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18
7989 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7990 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0
7991 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19
7992 ; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen
7993 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0
7994 ; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen
7995 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
7996 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15
7997 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
7998 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0
7999 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17
8000 ; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen
8001 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0
8002 ; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen
8003 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
8004 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8005 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0
8006 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
8007 ; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
8008 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
8009 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8010 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14
8011 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
8012 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
8013 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0
8014 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
8015 ; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
8016 ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0
8017 ; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
8018 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
8019 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
8020 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
8021 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0
8022 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
8023 ; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
8024 ; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0
8025 ; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
8026 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
8027 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
8028 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
8029 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0
8030 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8031 ; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
8032 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8
8033 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0
8034 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0
8035 ; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
8036 ; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen
8037 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
8038 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4
8039 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
8040 ; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen
8041 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5
8042 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
8043 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
8044 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
8045 ; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
8046 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0
8047 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
8048 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8049 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16
8050 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0
8051 ; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen
8052 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
8053 ; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8054 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
8055 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
8056 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
8057 ; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
8058 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0
8059 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
8060 ; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
8061 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
8062 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
8063 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
8064 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0
8065 ; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8066 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
8067 ; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
8068 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
8069 ; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
8070 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
8071 ; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen
8072 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
8073 ; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
8074 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
8075 ; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
8076 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
8077 ; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
8078 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
8079 ; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
8080 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
8081 ; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
8082 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
8083 ; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8084 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
8085 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8086 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0
8087 ; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen
8088 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
8089 ; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen
8090 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0
8091 ; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
8092 ; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen
8093 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8094 ; GFX7-NEXT: s_setpc_b64 s[30:31]
8096 ; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
8098 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8099 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1
8100 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
8101 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
8102 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
8103 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1
8104 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
8105 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1
8106 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
8107 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
8108 ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
8109 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
8110 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
8111 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1
8112 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
8113 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1
8114 ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
8115 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1
8116 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
8117 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1
8118 ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
8119 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
8120 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
8121 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
8122 ; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc
8123 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1
8124 ; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
8125 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1
8126 ; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
8127 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
8128 ; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
8129 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1
8130 ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
8131 ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1
8132 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
8133 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
8134 ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
8135 ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
8136 ; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
8137 ; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
8138 ; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
8139 ; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
8140 ; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
8141 ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
8142 ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
8143 ; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1
8144 ; GFX8-NEXT: flat_load_ushort v43, v[1:2]
8145 ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
8146 ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1
8147 ; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
8148 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
8149 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8150 ; GFX8-NEXT: flat_load_ushort v44, v[50:51]
8151 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
8152 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8153 ; GFX8-NEXT: flat_load_ushort v45, v[50:51]
8154 ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1
8155 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
8156 ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
8157 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
8158 ; GFX8-NEXT: flat_load_ushort v46, v[52:53]
8159 ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1
8160 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
8161 ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
8162 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
8163 ; GFX8-NEXT: flat_load_ushort v47, v[54:55]
8164 ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1
8165 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
8166 ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
8167 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
8168 ; GFX8-NEXT: flat_load_ushort v56, v[39:40]
8169 ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
8170 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
8171 ; GFX8-NEXT: flat_load_ushort v57, v[39:40]
8172 ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1
8173 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
8174 ; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1
8175 ; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc
8176 ; GFX8-NEXT: flat_load_ushort v41, v[41:42]
8177 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1
8178 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
8179 ; GFX8-NEXT: flat_load_ushort v42, v[9:10]
8180 ; GFX8-NEXT: flat_load_ushort v9, v[35:36]
8181 ; GFX8-NEXT: flat_load_ushort v10, v[37:38]
8182 ; GFX8-NEXT: flat_load_ushort v35, v[48:49]
8183 ; GFX8-NEXT: flat_load_ushort v36, v[50:51]
8184 ; GFX8-NEXT: flat_load_ushort v37, v[52:53]
8185 ; GFX8-NEXT: flat_load_ushort v48, v[54:55]
8186 ; GFX8-NEXT: flat_load_ushort v39, v[39:40]
8187 ; GFX8-NEXT: flat_load_ushort v49, v[1:2]
8188 ; GFX8-NEXT: flat_load_ushort v50, v[3:4]
8189 ; GFX8-NEXT: flat_load_ushort v51, v[5:6]
8190 ; GFX8-NEXT: flat_load_ushort v52, v[7:8]
8191 ; GFX8-NEXT: flat_load_ushort v53, v[11:12]
8192 ; GFX8-NEXT: flat_load_ushort v38, v[13:14]
8193 ; GFX8-NEXT: flat_load_ushort v14, v[17:18]
8194 ; GFX8-NEXT: flat_load_ushort v11, v[21:22]
8195 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0
8196 ; GFX8-NEXT: flat_load_ushort v15, v[15:16]
8197 ; GFX8-NEXT: flat_load_ushort v13, v[19:20]
8198 ; GFX8-NEXT: flat_load_ushort v8, v[23:24]
8199 ; GFX8-NEXT: flat_load_ushort v6, v[25:26]
8200 ; GFX8-NEXT: flat_load_ushort v5, v[27:28]
8201 ; GFX8-NEXT: flat_load_ushort v7, v[29:30]
8202 ; GFX8-NEXT: flat_load_ushort v12, v[31:32]
8203 ; GFX8-NEXT: flat_load_ushort v16, v[33:34]
8204 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0
8205 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0
8206 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0
8207 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0
8208 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0
8209 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0
8210 ; GFX8-NEXT: s_waitcnt vmcnt(14)
8211 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43
8212 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8213 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8214 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
8215 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0
8216 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44
8217 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
8218 ; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8219 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45
8220 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8221 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0
8222 ; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
8223 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0
8224 ; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
8225 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46
8226 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8227 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0
8228 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8229 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0
8230 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8231 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0
8232 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47
8233 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
8234 ; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
8235 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0
8236 ; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8237 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0
8238 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56
8239 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8240 ; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
8241 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0
8242 ; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
8243 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57
8244 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8245 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0
8246 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8247 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0
8248 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8249 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0
8250 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8251 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41
8252 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8253 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0
8254 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42
8255 ; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8256 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0
8257 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
8258 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
8259 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49
8260 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8261 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50
8262 ; GFX8-NEXT: s_waitcnt vmcnt(14)
8263 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51
8264 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52
8265 ; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
8266 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0
8267 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8268 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17
8269 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39
8270 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
8271 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53
8272 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38
8273 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
8274 ; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
8275 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0
8276 ; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
8277 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19
8278 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48
8279 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
8280 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
8281 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
8282 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
8283 ; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
8284 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0
8285 ; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
8286 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21
8287 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37
8288 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
8289 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
8290 ; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen
8291 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0
8292 ; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
8293 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
8294 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36
8295 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
8296 ; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen
8297 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0
8298 ; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen
8299 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25
8300 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35
8301 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
8302 ; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
8303 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10
8304 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0
8305 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11
8306 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0
8307 ; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
8308 ; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
8309 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0
8310 ; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen
8311 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
8312 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14
8313 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0
8314 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15
8315 ; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen
8316 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0
8317 ; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen
8318 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9
8319 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
8320 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
8321 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13
8322 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0
8323 ; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen
8324 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0
8325 ; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen
8326 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
8327 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12
8328 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9
8329 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0
8330 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
8331 ; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen
8332 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0
8333 ; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
8334 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6
8335 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
8336 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
8337 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0
8338 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
8339 ; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen
8340 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0
8341 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
8342 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
8343 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
8344 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0
8345 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
8346 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0
8347 ; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
8348 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0
8349 ; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen
8350 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0
8351 ; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen
8352 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0
8353 ; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen
8354 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0
8355 ; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
8356 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
8357 ; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
8358 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0
8359 ; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
8360 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
8361 ; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen
8362 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0
8363 ; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
8364 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0
8365 ; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen
8366 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
8367 ; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
8368 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
8369 ; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
8370 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
8371 ; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
8372 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0
8373 ; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
8374 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0
8375 ; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
8376 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0
8377 ; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
8378 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0
8379 ; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
8380 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0
8381 ; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
8382 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0
8383 ; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
8384 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0
8385 ; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
8386 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0
8387 ; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
8388 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0
8389 ; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
8390 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0
8391 ; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
8392 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0
8393 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
8394 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
8395 ; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
8396 ; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
8397 ; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
8398 ; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
8399 ; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
8400 ; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
8401 ; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
8402 ; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
8403 ; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
8404 ; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
8405 ; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
8406 ; GFX8-NEXT: s_waitcnt vmcnt(0)
8407 ; GFX8-NEXT: s_setpc_b64 s[30:31]
8409 ; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
8411 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8412 ; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62
8413 ; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60
8414 ; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58
8415 ; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56
8416 ; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54
8417 ; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52
8418 ; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50
8419 ; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48
8420 ; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46
8421 ; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44
8422 ; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42
8423 ; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40
8424 ; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38
8425 ; GFX9-NEXT: global_load_ushort v19, v[1:2], off
8426 ; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36
8427 ; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2
8428 ; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4
8429 ; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34
8430 ; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32
8431 ; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6
8432 ; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8
8433 ; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30
8434 ; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
8435 ; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
8436 ; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
8437 ; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
8438 ; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
8439 ; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26
8440 ; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28
8441 ; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10
8442 ; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
8443 ; GFX9-NEXT: s_nop 0
8444 ; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
8445 ; GFX9-NEXT: s_waitcnt vmcnt(31)
8446 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21
8447 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
8448 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8449 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
8450 ; GFX9-NEXT: s_waitcnt vmcnt(28)
8451 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25
8452 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
8453 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
8454 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
8455 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
8456 ; GFX9-NEXT: s_waitcnt vmcnt(29)
8457 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26
8458 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
8459 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
8460 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
8461 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8462 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27
8463 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
8464 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
8465 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
8466 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24
8467 ; GFX9-NEXT: s_waitcnt vmcnt(31)
8468 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28
8469 ; GFX9-NEXT: s_waitcnt vmcnt(30)
8470 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29
8471 ; GFX9-NEXT: s_waitcnt vmcnt(29)
8472 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30
8473 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
8474 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
8475 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25
8476 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26
8477 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
8478 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
8479 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27
8480 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2
8481 ; GFX9-NEXT: s_waitcnt vmcnt(28)
8482 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
8483 ; GFX9-NEXT: s_waitcnt vmcnt(27)
8484 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20
8485 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
8486 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31
8487 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32
8488 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33
8489 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34
8490 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
8491 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
8492 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29
8493 ; GFX9-NEXT: s_waitcnt vmcnt(26)
8494 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
8495 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30
8496 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
8497 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
8498 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31
8499 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32
8500 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
8501 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
8502 ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
8503 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
8504 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
8505 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
8506 ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
8507 ; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
8508 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
8509 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
8510 ; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
8511 ; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
8512 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17
8513 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
8514 ; GFX9-NEXT: s_waitcnt vmcnt(39)
8515 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
8516 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
8517 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
8518 ; GFX9-NEXT: s_waitcnt vmcnt(40)
8519 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13
8520 ; GFX9-NEXT: s_waitcnt vmcnt(39)
8521 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14
8522 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
8523 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
8524 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
8525 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
8526 ; GFX9-NEXT: s_waitcnt vmcnt(40)
8527 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15
8528 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
8529 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
8530 ; GFX9-NEXT: s_waitcnt vmcnt(34)
8531 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
8532 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
8533 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
8534 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
8535 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
8536 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
8537 ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
8538 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
8539 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
8540 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
8541 ; GFX9-NEXT: s_waitcnt vmcnt(38)
8542 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
8543 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
8544 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
8545 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
8546 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
8547 ; GFX9-NEXT: s_waitcnt vmcnt(39)
8548 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
8549 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
8550 ; GFX9-NEXT: s_waitcnt vmcnt(38)
8551 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
8552 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5
8553 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
8554 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
8555 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
8556 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
8557 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8558 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3
8559 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
8560 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8561 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
8562 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
8563 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
8564 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
8565 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5
8566 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18
8567 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21
8568 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22
8569 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
8570 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23
8571 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12
8572 ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
8573 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
8574 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
8575 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
8576 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
8577 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
8578 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
8579 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
8580 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
8581 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
8582 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
8583 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
8584 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
8585 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
8586 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
8587 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
8588 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
8589 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
8590 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
8591 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
8592 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8593 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8595 ; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
8597 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8598 ; GFX10-NEXT: s_clause 0x1f
8599 ; GFX10-NEXT: global_load_ushort v3, v[1:2], off
8600 ; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2
8601 ; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4
8602 ; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6
8603 ; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8
8604 ; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10
8605 ; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12
8606 ; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14
8607 ; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16
8608 ; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18
8609 ; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20
8610 ; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22
8611 ; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24
8612 ; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
8613 ; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
8614 ; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
8615 ; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32
8616 ; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34
8617 ; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36
8618 ; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38
8619 ; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40
8620 ; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42
8621 ; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44
8622 ; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46
8623 ; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48
8624 ; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62
8625 ; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50
8626 ; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52
8627 ; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54
8628 ; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60
8629 ; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56
8630 ; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58
8631 ; GFX10-NEXT: s_waitcnt vmcnt(31)
8632 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
8633 ; GFX10-NEXT: s_waitcnt vmcnt(30)
8634 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4
8635 ; GFX10-NEXT: s_waitcnt vmcnt(29)
8636 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5
8637 ; GFX10-NEXT: s_waitcnt vmcnt(28)
8638 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6
8639 ; GFX10-NEXT: s_waitcnt vmcnt(27)
8640 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
8641 ; GFX10-NEXT: s_waitcnt vmcnt(26)
8642 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8
8643 ; GFX10-NEXT: s_waitcnt vmcnt(25)
8644 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9
8645 ; GFX10-NEXT: s_waitcnt vmcnt(24)
8646 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10
8647 ; GFX10-NEXT: s_waitcnt vmcnt(23)
8648 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11
8649 ; GFX10-NEXT: s_waitcnt vmcnt(22)
8650 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12
8651 ; GFX10-NEXT: s_waitcnt vmcnt(21)
8652 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13
8653 ; GFX10-NEXT: s_waitcnt vmcnt(20)
8654 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
8655 ; GFX10-NEXT: s_waitcnt vmcnt(19)
8656 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15
8657 ; GFX10-NEXT: s_waitcnt vmcnt(18)
8658 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16
8659 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37
8660 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38
8661 ; GFX10-NEXT: s_waitcnt vmcnt(15)
8662 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19
8663 ; GFX10-NEXT: s_waitcnt vmcnt(14)
8664 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
8665 ; GFX10-NEXT: s_waitcnt vmcnt(13)
8666 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21
8667 ; GFX10-NEXT: s_waitcnt vmcnt(12)
8668 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
8669 ; GFX10-NEXT: s_waitcnt vmcnt(11)
8670 ; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23
8671 ; GFX10-NEXT: s_waitcnt vmcnt(10)
8672 ; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24
8673 ; GFX10-NEXT: s_waitcnt vmcnt(9)
8674 ; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25
8675 ; GFX10-NEXT: s_waitcnt vmcnt(8)
8676 ; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26
8677 ; GFX10-NEXT: s_waitcnt vmcnt(7)
8678 ; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27
8679 ; GFX10-NEXT: s_waitcnt vmcnt(6)
8680 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28
8681 ; GFX10-NEXT: s_waitcnt vmcnt(5)
8682 ; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29
8683 ; GFX10-NEXT: s_waitcnt vmcnt(4)
8684 ; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30
8685 ; GFX10-NEXT: s_waitcnt vmcnt(3)
8686 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31
8687 ; GFX10-NEXT: s_waitcnt vmcnt(2)
8688 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32
8689 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
8690 ; GFX10-NEXT: s_waitcnt vmcnt(0)
8691 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34
8692 ; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33
8693 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
8694 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
8695 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84
8696 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
8697 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
8698 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50
8699 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51
8700 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82
8701 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52
8702 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53
8703 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80
8704 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35
8705 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
8706 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48
8707 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49
8708 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54
8709 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55
8710 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70
8711 ; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18
8712 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
8713 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
8714 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83
8715 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17
8716 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
8717 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
8718 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
8719 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81
8720 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
8721 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
8722 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71
8723 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
8724 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
8725 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65
8726 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64
8727 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
8728 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
8729 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67
8730 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66
8731 ; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
8732 ; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
8733 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69
8734 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
8735 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
8736 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
8737 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
8738 ; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
8739 ; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
8740 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
8741 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
8742 ; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
8743 ; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
8744 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
8745 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
8746 ; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
8747 ; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
8748 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
8749 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
8750 ; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
8751 ; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
8752 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
8753 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
8754 ; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
8755 ; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
8756 ; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
8757 ; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
8758 ; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
8759 ; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
8760 ; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
8761 ; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
8762 ; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
8763 ; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
8764 ; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
8765 ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
8766 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
8767 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
8768 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
8769 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
8770 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
8771 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
8772 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
8773 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
8774 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
8775 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
8776 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
8777 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
8778 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
8779 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
8780 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
8781 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
8782 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
8783 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
8784 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
8785 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
8786 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
8787 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
8788 ; GFX10-NEXT: s_setpc_b64 s[30:31]
8790 ; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
8792 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8793 ; GFX11-NEXT: s_clause 0x1f
8794 ; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12
8795 ; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8
8796 ; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4
8797 ; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2
8798 ; GFX11-NEXT: global_load_u16 v7, v[1:2], off
8799 ; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6
8800 ; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10
8801 ; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14
8802 ; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28
8803 ; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24
8804 ; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20
8805 ; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18
8806 ; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16
8807 ; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22
8808 ; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26
8809 ; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30
8810 ; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44
8811 ; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40
8812 ; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36
8813 ; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34
8814 ; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32
8815 ; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38
8816 ; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42
8817 ; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46
8818 ; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60
8819 ; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56
8820 ; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52
8821 ; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50
8822 ; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48
8823 ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
8824 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
8825 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
8826 ; GFX11-NEXT: s_waitcnt vmcnt(31)
8827 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
8828 ; GFX11-NEXT: s_waitcnt vmcnt(30)
8829 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4
8830 ; GFX11-NEXT: s_waitcnt vmcnt(29)
8831 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
8832 ; GFX11-NEXT: s_waitcnt vmcnt(28)
8833 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6
8834 ; GFX11-NEXT: s_waitcnt vmcnt(27)
8835 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7
8836 ; GFX11-NEXT: s_waitcnt vmcnt(26)
8837 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
8838 ; GFX11-NEXT: s_waitcnt vmcnt(25)
8839 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
8840 ; GFX11-NEXT: s_waitcnt vmcnt(24)
8841 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
8842 ; GFX11-NEXT: s_waitcnt vmcnt(23)
8843 ; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11
8844 ; GFX11-NEXT: s_waitcnt vmcnt(22)
8845 ; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v12
8846 ; GFX11-NEXT: s_waitcnt vmcnt(21)
8847 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
8848 ; GFX11-NEXT: s_waitcnt vmcnt(20)
8849 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
8850 ; GFX11-NEXT: s_waitcnt vmcnt(19)
8851 ; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15
8852 ; GFX11-NEXT: s_waitcnt vmcnt(18)
8853 ; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16
8854 ; GFX11-NEXT: s_waitcnt vmcnt(17)
8855 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
8856 ; GFX11-NEXT: s_waitcnt vmcnt(16)
8857 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
8858 ; GFX11-NEXT: s_waitcnt vmcnt(15)
8859 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19
8860 ; GFX11-NEXT: s_waitcnt vmcnt(14)
8861 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20
8862 ; GFX11-NEXT: s_waitcnt vmcnt(13)
8863 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
8864 ; GFX11-NEXT: s_waitcnt vmcnt(12)
8865 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
8866 ; GFX11-NEXT: s_waitcnt vmcnt(11)
8867 ; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23
8868 ; GFX11-NEXT: s_waitcnt vmcnt(10)
8869 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24
8870 ; GFX11-NEXT: s_waitcnt vmcnt(9)
8871 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
8872 ; GFX11-NEXT: s_waitcnt vmcnt(8)
8873 ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
8874 ; GFX11-NEXT: s_waitcnt vmcnt(7)
8875 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27
8876 ; GFX11-NEXT: s_waitcnt vmcnt(6)
8877 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28
8878 ; GFX11-NEXT: s_waitcnt vmcnt(5)
8879 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
8880 ; GFX11-NEXT: s_waitcnt vmcnt(4)
8881 ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
8882 ; GFX11-NEXT: s_waitcnt vmcnt(3)
8883 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31
8884 ; GFX11-NEXT: s_waitcnt vmcnt(2)
8885 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32
8886 ; GFX11-NEXT: s_waitcnt vmcnt(1)
8887 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33
8888 ; GFX11-NEXT: s_waitcnt vmcnt(0)
8889 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8890 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68
8891 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v65
8892 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64
8893 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33
8894 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1
8895 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29
8896 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30
8897 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53
8898 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26
8899 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52
8900 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25
8901 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49
8902 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48
8903 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21
8904 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34
8905 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22
8906 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103
8907 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18
8908 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102
8909 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17
8910 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101
8911 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13
8912 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14
8913 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100
8914 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10
8915 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39
8916 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
8917 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38
8918 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6
8919 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
8920 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
8921 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
8922 ; GFX11-NEXT: s_clause 0xf
8923 ; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
8924 ; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
8925 ; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
8926 ; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
8927 ; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
8928 ; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
8929 ; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
8930 ; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
8931 ; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
8932 ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
8933 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
8934 ; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
8935 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
8936 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
8937 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
8938 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
8939 ; GFX11-NEXT: s_setpc_b64 s[30:31]
8940 %load = load <32 x bfloat>, ptr addrspace(1) %ptr
8941 %fpext = fpext <32 x bfloat> %load to <32 x double>
8942 ret <32 x double> %fpext
; v_fadd_bf16: scalar bfloat addition. The IR body is a single
; `fadd bfloat %a, %b`; everything else in this block is the expected
; machine code for each llc configuration listed at the top of the file.
;
; The check lines are autogenerated (see the first line of this file) and
; must match llc output exactly; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected code shows, per check prefix:
;  * GCN and GFX7 checks: both inputs are multiplied by 1.0 and masked with
;    0xffff0000, added with v_add_f32, and the sum is masked with 0xffff0000
;    again. No shift is expected, i.e. the value stays in the upper 16 bits
;    of the register.
;  * GFX8 checks: both inputs are shifted left by 16 before the f32 add.
;    The sum is then narrowed by adding bit 16 of the sum (v_bfe_u32) plus
;    0x7fff, while a v_cmp_u_f32 of the sum against itself selects
;    `sum | 0x400000` instead (presumably the NaN path - TODO confirm),
;    followed by a right shift by 16.
;  * GFX9 checks: same sequence, but the two integer adds are one
;    v_add3_u32 with 0x7fff held in s4.
;  * GFX10 and GFX11 checks: same as GFX9 with 0x7fff as an inline literal
;    and vcc_lo as the condition register; GFX11 additionally expects
;    s_delay_alu hints. Both gfx1100 runs (with and without real-true16)
;    share the GFX11 prefix here, so their output must be identical.
8945 define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
8946 ; GCN-LABEL: v_fadd_bf16:
8948 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8949 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
8950 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
8951 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
8952 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8953 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
8954 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8955 ; GCN-NEXT: s_setpc_b64 s[30:31]
8957 ; GFX7-LABEL: v_fadd_bf16:
8959 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8960 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
8961 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
8962 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
8963 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8964 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
8965 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8966 ; GFX7-NEXT: s_setpc_b64 s[30:31]
8968 ; GFX8-LABEL: v_fadd_bf16:
8970 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8971 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8972 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8973 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
8974 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
8975 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
8976 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
8977 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
8978 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
8979 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
8980 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
8981 ; GFX8-NEXT: s_setpc_b64 s[30:31]
8983 ; GFX9-LABEL: v_fadd_bf16:
8985 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8986 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
8987 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
8988 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
8989 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
8990 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
8991 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
8992 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
8993 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
8994 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
8995 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
8996 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8998 ; GFX10-LABEL: v_fadd_bf16:
9000 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9001 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9002 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
9003 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
9004 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
9005 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
9006 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9007 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
9008 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9009 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9010 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9012 ; GFX11-LABEL: v_fadd_bf16:
9014 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9015 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9016 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
9017 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9018 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
9019 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
9020 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
9021 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9022 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9023 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
9024 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
9025 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9026 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9027 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9028 %op = fadd bfloat %a, %b
; v_fadd_v2bf16: element-wise addition of two <2 x bfloat> vectors. The IR
; body is one `fadd <2 x bfloat> %a, %b` whose result is returned.
;
; The check lines are autogenerated (see the first line of this file) and
; must match llc output exactly; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected code shows, per check prefix:
;  * GCN and GFX7 checks: four input registers (v0..v3) are each multiplied
;    by 1.0 and masked with 0xffff0000; the sums are v0+v2 and v1+v3, each
;    masked with 0xffff0000 afterwards, so every element occupies its own
;    register.
;  * GFX8 and later: only v0 and v1 are read. One element of each operand is
;    obtained by shifting left by 16 and the other by masking with
;    0xffff0000, giving two independent f32 adds. Each sum goes through the
;    same narrowing sequence as the scalar test (bit 16 of the sum plus
;    0x7fff, with `sum | 0x400000` selected when the sum compares unordered
;    with itself - presumably the NaN path, TODO confirm).
;  * Repacking into v0 differs by target: the GFX8 checks shift one result
;    right by 16 and merge with v_alignbit_b32; the GFX9 checks use
;    v_perm_b32 with selector 0x7060302 loaded into s4 (s4 is reused after
;    holding 0x7fff); the GFX10 and GFX11 checks pass 0x7060302 to
;    v_perm_b32 as a literal.
;  * GFX11 additionally expects s_delay_alu hints. Both gfx1100 runs (with
;    and without real-true16) share the GFX11 prefix here, so their output
;    must be identical.
9032 define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
9033 ; GCN-LABEL: v_fadd_v2bf16:
9035 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9036 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9037 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9038 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9039 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9040 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9041 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9042 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9043 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9044 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3
9045 ; GCN-NEXT: v_add_f32_e32 v0, v0, v2
9046 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9047 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9048 ; GCN-NEXT: s_setpc_b64 s[30:31]
9050 ; GFX7-LABEL: v_fadd_v2bf16:
9052 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9053 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9054 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9055 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9056 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9057 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9058 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9059 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9060 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9061 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
9062 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
9063 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9064 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9065 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9067 ; GFX8-LABEL: v_fadd_v2bf16:
9069 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9070 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9071 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9072 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
9073 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
9074 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
9075 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9076 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9077 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
9078 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
9079 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
9080 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9081 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
9082 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
9083 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
9084 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
9085 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
9086 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9087 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
9088 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9089 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
9090 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9092 ; GFX9-LABEL: v_fadd_v2bf16:
9094 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9095 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9096 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9097 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
9098 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9099 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9100 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
9101 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9102 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
9103 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
9104 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
9105 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9106 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
9107 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
9108 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
9109 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
9110 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9111 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
9112 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9113 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
9114 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9116 ; GFX10-LABEL: v_fadd_v2bf16:
9118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9119 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9120 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9121 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9122 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9123 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
9124 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
9125 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
9126 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
9127 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
9128 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9129 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
9130 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
9131 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
9132 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9133 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9134 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9135 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
9136 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9138 ; GFX11-LABEL: v_fadd_v2bf16:
9140 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9141 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
9142 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9143 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
9144 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
9146 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
9147 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2
9148 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9149 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
9150 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
9151 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
9152 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9153 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
9154 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
9155 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
9156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9157 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
9158 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9159 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
9160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9161 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
9162 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9163 %op = fadd <2 x bfloat> %a, %b
9164 ret <2 x bfloat> %op
9167 define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
9168 ; GCN-LABEL: v_fadd_v3bf16:
9170 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9171 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9172 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9173 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9174 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9175 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9176 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9177 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9178 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9179 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9180 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9181 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9182 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9183 ; GCN-NEXT: v_add_f32_e32 v2, v2, v5
9184 ; GCN-NEXT: v_add_f32_e32 v1, v1, v4
9185 ; GCN-NEXT: v_add_f32_e32 v0, v0, v3
9186 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9187 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9188 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9189 ; GCN-NEXT: s_setpc_b64 s[30:31]
9191 ; GFX7-LABEL: v_fadd_v3bf16:
9193 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9194 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9195 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9196 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9197 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9198 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9199 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9200 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9201 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9202 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9203 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9204 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9205 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9206 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
9207 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
9208 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
9209 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9210 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9211 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9212 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9214 ; GFX8-LABEL: v_fadd_v3bf16:
9216 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9217 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9218 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9219 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
9220 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
9221 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
9222 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
9223 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
9224 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9225 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
9226 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9227 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
9228 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
9229 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
9230 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9231 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
9232 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9233 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9234 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
9235 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
9236 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
9237 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9238 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
9239 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
9240 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
9241 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
9242 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
9243 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9244 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
9245 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9246 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9247 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
9248 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9250 ; GFX9-LABEL: v_fadd_v3bf16:
9252 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9253 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9254 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9255 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
9256 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
9257 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9258 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
9259 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
9260 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9261 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
9262 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9263 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
9264 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
9265 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9266 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9267 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
9268 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
9269 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
9270 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
9271 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9272 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
9273 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
9274 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
9275 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
9276 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9277 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
9278 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9279 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
9280 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
9281 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9283 ; GFX10-LABEL: v_fadd_v3bf16:
9285 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9286 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9287 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9288 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9289 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9290 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9291 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
9292 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
9293 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
9294 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
9295 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
9296 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
9297 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
9298 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9299 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
9300 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9301 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
9302 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9303 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
9304 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9305 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9306 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9307 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9308 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9309 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9310 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9311 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
9312 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9314 ; GFX11TRUE16-LABEL: v_fadd_v3bf16:
9315 ; GFX11TRUE16: ; %bb.0:
9316 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9317 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9318 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9319 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9320 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9321 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9322 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9323 ; GFX11TRUE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9324 ; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9325 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9326 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
9327 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
9328 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9329 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
9330 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
9331 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9332 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
9333 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
9334 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9335 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9336 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9337 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9338 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9339 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9340 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9341 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9342 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9343 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
9344 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
9345 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
9347 ; GFX11FAKE16-LABEL: v_fadd_v3bf16:
9348 ; GFX11FAKE16: ; %bb.0:
9349 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9350 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
9351 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9352 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
9353 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9354 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9355 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
9356 ; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
9357 ; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
9358 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9359 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
9360 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
9361 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9362 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
9363 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
9364 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
9365 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
9366 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
9367 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
9368 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9369 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
9370 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9371 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9372 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
9373 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9374 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
9375 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
9376 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
9377 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
9378 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
9379 %op = fadd <3 x bfloat> %a, %b
9380 ret <3 x bfloat> %op
; v_fadd_v4bf16 - codegen check for an element-wise fadd of two <4 x bfloat>
; arguments (%a, %b); the sum vector is returned directly.
;
; What the expected output below shows, grouped by check prefix:
;  * GCN and GFX7 prefixes - v_mul_f32 by 1.0 on each of v0-v7, every register
;    masked with 0xffff0000, four v_add_f32 (v3+v7, v2+v6, v1+v5, v0+v4), and
;    the four sums masked with 0xffff0000 again. Presumably one bfloat per
;    VGPR, kept in the upper 16 bits - NOTE(review): confirm against the ABI.
;  * GFX8 prefix and newer - inputs are read from v0-v3 only; each register
;    is both shifted left by 16 and masked with 0xffff0000 to form the f32
;    operands of the four v_add_f32. Each sum then has its bit 16 (v_bfe_u32)
;    and 0x7fff added to it, and a v_cndmask_b32 keyed on a v_cmp_u_f32
;    self-compare chooses between that value and (sum | 0x400000). This looks
;    like round-to-nearest-even with NaN quieting - NOTE(review): confirm.
;    Results are repacked two per register into v0/v1, using v_alignbit_b32
;    on the GFX8 prefix and v_perm_b32 with selector 0x7060302 on later ones.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing individual check lines by hand.
9383 define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
9384 ; GCN-LABEL: v_fadd_v4bf16:
9386 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9387 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9388 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9389 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9390 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9391 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9392 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
9393 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9394 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
9395 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9396 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9397 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9398 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9399 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9400 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9401 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9402 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9403 ; GCN-NEXT: v_add_f32_e32 v3, v3, v7
9404 ; GCN-NEXT: v_add_f32_e32 v2, v2, v6
9405 ; GCN-NEXT: v_add_f32_e32 v1, v1, v5
9406 ; GCN-NEXT: v_add_f32_e32 v0, v0, v4
9407 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9408 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9409 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9410 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9411 ; GCN-NEXT: s_setpc_b64 s[30:31]
9413 ; GFX7-LABEL: v_fadd_v4bf16:
9415 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9416 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9417 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9418 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9419 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9420 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9421 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
9422 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9423 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
9424 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9425 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9426 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9427 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9428 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9429 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9430 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9431 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9432 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
9433 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
9434 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
9435 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
9436 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9437 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9438 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9439 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9440 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9442 ; GFX8-LABEL: v_fadd_v4bf16:
9444 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9445 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9446 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9447 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
9448 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
9449 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
9450 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9451 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9452 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
9453 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
9454 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
9455 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
9456 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
9457 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9458 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
9459 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
9460 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
9461 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
9462 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9463 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
9464 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9465 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9466 ; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
9467 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
9468 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
9469 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9470 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9471 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
9472 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
9473 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
9474 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9475 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
9476 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
9477 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
9478 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
9479 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
9480 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9481 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
9482 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9483 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9484 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
9485 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
9486 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9488 ; GFX9-LABEL: v_fadd_v4bf16:
9490 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9491 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9492 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9493 ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
9494 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9495 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9496 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
9497 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9498 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
9499 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
9500 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
9501 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
9502 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
9503 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
9504 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
9505 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
9506 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9507 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
9508 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
9509 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
9510 ; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
9511 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9512 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9513 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
9514 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
9515 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
9516 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
9517 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9518 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
9519 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
9520 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
9521 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
9522 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9523 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
9524 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9525 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
9526 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
9527 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9529 ; GFX10-LABEL: v_fadd_v4bf16:
9531 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9532 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9533 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9534 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9535 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9536 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
9537 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
9538 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
9539 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9540 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9541 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
9542 ; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
9543 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
9544 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
9545 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
9546 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9547 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
9548 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
9549 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
9550 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
9551 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
9552 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
9553 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9554 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
9555 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9556 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
9557 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
9558 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
9559 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9560 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9561 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9562 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9563 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
9564 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9565 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9566 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9568 ; GFX11-LABEL: v_fadd_v4bf16:
9570 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9571 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
9572 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
9573 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9574 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9575 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
9576 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9577 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9578 ; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
9579 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9580 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
9581 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9582 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
9583 ; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
9584 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
9585 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
9586 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
9587 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
9588 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
9589 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9590 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
9591 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
9592 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
9593 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
9594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9595 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
9596 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
9597 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9598 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
9599 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
9600 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
9601 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9602 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9603 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
9604 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9605 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
9606 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
9607 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9608 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9609 ; GFX11-NEXT: s_setpc_b64 s[30:31]
9610 %op = fadd <4 x bfloat> %a, %b
9611 ret <4 x bfloat> %op
9614 define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
9615 ; GCN-LABEL: v_fadd_v8bf16:
9617 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9618 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
9619 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
9620 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
9621 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
9622 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
9623 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
9624 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
9625 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
9626 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
9627 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
9628 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
9629 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
9630 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
9631 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
9632 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
9633 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
9634 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
9635 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9636 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
9637 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9638 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
9639 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9640 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
9641 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9642 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
9643 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9644 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
9645 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9646 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
9647 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9648 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
9649 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9650 ; GCN-NEXT: v_add_f32_e32 v7, v7, v15
9651 ; GCN-NEXT: v_add_f32_e32 v6, v6, v14
9652 ; GCN-NEXT: v_add_f32_e32 v5, v5, v13
9653 ; GCN-NEXT: v_add_f32_e32 v4, v4, v12
9654 ; GCN-NEXT: v_add_f32_e32 v3, v3, v11
9655 ; GCN-NEXT: v_add_f32_e32 v2, v2, v10
9656 ; GCN-NEXT: v_add_f32_e32 v1, v1, v9
9657 ; GCN-NEXT: v_add_f32_e32 v0, v0, v8
9658 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9659 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9660 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9661 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9662 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9663 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9664 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9665 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9666 ; GCN-NEXT: s_setpc_b64 s[30:31]
9668 ; GFX7-LABEL: v_fadd_v8bf16:
9670 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9671 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
9672 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
9673 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
9674 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
9675 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
9676 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
9677 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
9678 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
9679 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
9680 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
9681 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
9682 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
9683 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
9684 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
9685 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
9686 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
9687 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
9688 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9689 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
9690 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9691 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
9692 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9693 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
9694 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9695 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
9696 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9697 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
9698 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9699 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
9700 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9701 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
9702 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9703 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v15
9704 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v14
9705 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v13
9706 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v12
9707 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v11
9708 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v10
9709 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
9710 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
9711 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9712 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9713 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9714 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9715 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9716 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9717 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9718 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9719 ; GFX7-NEXT: s_setpc_b64 s[30:31]
9721 ; GFX8-LABEL: v_fadd_v8bf16:
9723 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9724 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9725 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9726 ; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
9727 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
9728 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
9729 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9730 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9731 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
9732 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
9733 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
9734 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
9735 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
9736 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
9737 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
9738 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
9739 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
9740 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
9741 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9742 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
9743 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
9744 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
9745 ; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
9746 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
9747 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
9748 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9749 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9750 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9751 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
9752 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
9753 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
9754 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
9755 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
9756 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
9757 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
9758 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
9759 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9760 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
9761 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9762 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
9763 ; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
9764 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
9765 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
9766 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9767 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9768 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9769 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
9770 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
9771 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
9772 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
9773 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
9774 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
9775 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
9776 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
9777 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9778 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
9779 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
9780 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
9781 ; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
9782 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
9783 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
9784 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9785 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9786 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
9787 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
9788 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
9789 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
9790 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
9791 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
9792 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
9793 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
9794 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
9795 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9796 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
9797 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
9798 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
9799 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
9800 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
9801 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
9802 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
9803 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
9804 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
9805 ; GFX8-NEXT: s_setpc_b64 s[30:31]
9807 ; GFX9-LABEL: v_fadd_v8bf16:
9809 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9810 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9811 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9812 ; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
9813 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9814 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9815 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
9816 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
9817 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
9818 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
9819 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
9820 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
9821 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
9822 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
9823 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
9824 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
9825 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
9826 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
9827 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
9828 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
9829 ; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
9830 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9831 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9832 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
9833 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
9834 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
9835 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
9836 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
9837 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
9838 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
9839 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
9840 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
9841 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
9842 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
9843 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9844 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
9845 ; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
9846 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9847 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9848 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
9849 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
9850 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
9851 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
9852 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
9853 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
9854 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
9855 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
9856 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
9857 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
9858 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
9859 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
9860 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
9861 ; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
9862 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9863 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9864 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
9865 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
9866 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
9867 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
9868 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
9869 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
9870 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
9871 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
9872 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
9873 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
9874 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
9875 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
9876 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
9877 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
9878 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
9879 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
9880 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9882 ; GFX10-LABEL: v_fadd_v8bf16:
9884 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9885 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9886 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9887 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
9888 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9889 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
9890 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9891 ; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
9892 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
9893 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9894 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
9895 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
9896 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
9897 ; GFX10-NEXT: v_add_f32_e32 v7, v10, v9
9898 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
9899 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9900 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
9901 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
9902 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
9903 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
9904 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9905 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
9906 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9907 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
9908 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
9909 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
9910 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
9911 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
9912 ; GFX10-NEXT: v_add_f32_e32 v6, v10, v6
9913 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
9914 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9915 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9916 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
9917 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
9918 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
9919 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
9920 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
9921 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9922 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9923 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
9924 ; GFX10-NEXT: v_add_f32_e32 v5, v15, v13
9925 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
9926 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
9927 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
9928 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
9929 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
9930 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
9931 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
9932 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9933 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
9934 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
9935 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
9936 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
9937 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
9938 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
9939 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
9940 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
9941 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
9942 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
9943 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
9944 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9945 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
9946 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9947 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
9948 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
9949 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9950 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
9951 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
9952 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
9953 ; GFX10-NEXT: s_setpc_b64 s[30:31]
9955 ; GFX11-LABEL: v_fadd_v8bf16:
9957 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9958 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
9959 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
9960 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9961 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
9962 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
9963 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9964 ; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
9965 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
9966 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9967 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9968 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
9969 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9970 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
9971 ; GFX11-NEXT: v_add_f32_e32 v7, v10, v9
9972 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
9973 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
9974 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9975 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
9976 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
9977 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
9978 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
9979 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
9980 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
9981 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
9982 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
9983 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
9984 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
9985 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
9986 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9987 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9988 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
9989 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
9990 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
9991 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
9992 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9993 ; GFX11-NEXT: v_add_f32_e32 v6, v10, v6
9994 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
9995 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9996 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
9997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
9998 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
9999 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
10000 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
10001 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10002 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
10003 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10004 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10005 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
10006 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10007 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
10008 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
10009 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
10010 ; GFX11-NEXT: v_add_f32_e32 v5, v15, v13
10011 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10012 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
10013 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
10014 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
10015 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
10016 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10017 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
10018 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
10019 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10020 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
10021 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
10022 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
10023 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
10024 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
10025 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10026 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
10027 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10028 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10029 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
10030 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
10031 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10032 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
10033 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
10034 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10035 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
10036 ; GFX11-NEXT: s_setpc_b64 s[30:31]
10037 %op = fadd <8 x bfloat> %a, %b
10038 ret <8 x bfloat> %op
10041 define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
10042 ; GCN-LABEL: v_fadd_v16bf16:
10044 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10045 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
10046 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
10047 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10048 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10049 ; GCN-NEXT: v_add_f32_e32 v14, v14, v30
10050 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
10051 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
10052 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10053 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10054 ; GCN-NEXT: v_add_f32_e32 v13, v13, v29
10055 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
10056 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
10057 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10058 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10059 ; GCN-NEXT: v_add_f32_e32 v12, v12, v28
10060 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
10061 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
10062 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10063 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10064 ; GCN-NEXT: v_add_f32_e32 v11, v11, v27
10065 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
10066 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
10067 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10068 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10069 ; GCN-NEXT: v_add_f32_e32 v10, v10, v26
10070 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
10071 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
10072 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10073 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10074 ; GCN-NEXT: v_add_f32_e32 v9, v9, v25
10075 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
10076 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
10077 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10078 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10079 ; GCN-NEXT: v_add_f32_e32 v8, v8, v24
10080 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
10081 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
10082 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10083 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10084 ; GCN-NEXT: v_add_f32_e32 v7, v7, v23
10085 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
10086 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
10087 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10088 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10089 ; GCN-NEXT: v_add_f32_e32 v6, v6, v22
10090 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
10091 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
10092 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10093 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10094 ; GCN-NEXT: v_add_f32_e32 v5, v5, v21
10095 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
10096 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
10097 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
10098 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
10099 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
10100 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
10101 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
10102 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
10103 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
10104 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
10105 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
10106 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10107 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10108 ; GCN-NEXT: v_add_f32_e32 v4, v4, v20
10109 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
10110 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10111 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10112 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10113 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10114 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10115 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10116 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10117 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10118 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10119 ; GCN-NEXT: v_add_f32_e32 v3, v3, v19
10120 ; GCN-NEXT: v_add_f32_e32 v2, v2, v18
10121 ; GCN-NEXT: v_add_f32_e32 v1, v1, v17
10122 ; GCN-NEXT: v_add_f32_e32 v0, v0, v16
10123 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10124 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10125 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10126 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10127 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10128 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10129 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10130 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10131 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10132 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10133 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10134 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10135 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10136 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10137 ; GCN-NEXT: s_waitcnt vmcnt(0)
10138 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
10139 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10140 ; GCN-NEXT: v_add_f32_e32 v15, v15, v16
10141 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10142 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10143 ; GCN-NEXT: s_setpc_b64 s[30:31]
10145 ; GFX7-LABEL: v_fadd_v16bf16:
10147 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10148 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
10149 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
10150 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10151 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10152 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
10153 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
10154 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
10155 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
10156 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
10157 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
10158 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
10159 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
10160 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
10161 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
10162 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
10163 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
10164 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
10165 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
10166 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
10167 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
10168 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
10169 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
10170 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
10171 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
10172 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
10173 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
10174 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
10175 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
10176 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
10177 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
10178 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
10179 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
10180 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
10181 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
10182 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
10183 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10184 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10185 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10186 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10187 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10188 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10189 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10190 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10191 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10192 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10193 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10194 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10195 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10196 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10197 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10198 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10199 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10200 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10201 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10202 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10203 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10204 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10205 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10206 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10207 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10208 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10209 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10210 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10211 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10212 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
10213 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
10214 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
10215 ; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
10216 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
10217 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
10218 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
10219 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v23
10220 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v21
10221 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v20
10222 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v19
10223 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v18
10224 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v17
10225 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
10226 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10227 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10228 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10229 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10230 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10231 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10232 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10233 ; GFX7-NEXT: s_waitcnt vmcnt(0)
10234 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
10235 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10236 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
10237 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10238 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10239 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10240 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10241 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10242 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10243 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10244 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10245 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10246 ; GFX7-NEXT: s_setpc_b64 s[30:31]
10248 ; GFX8-LABEL: v_fadd_v16bf16:
10250 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10251 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10252 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10253 ; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
10254 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
10255 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
10256 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
10257 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10258 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10259 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10260 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
10261 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
10262 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
10263 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
10264 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
10265 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
10266 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
10267 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
10268 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
10269 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
10270 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
10271 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
10272 ; GFX8-NEXT: v_add_f32_e32 v15, v17, v15
10273 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
10274 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
10275 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10276 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10277 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10278 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
10279 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
10280 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
10281 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
10282 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
10283 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
10284 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
10285 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
10286 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
10287 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
10288 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
10289 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
10290 ; GFX8-NEXT: v_add_f32_e32 v14, v17, v14
10291 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
10292 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
10293 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10294 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10295 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10296 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
10297 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
10298 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
10299 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
10300 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
10301 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
10302 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
10303 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
10304 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
10305 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
10306 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
10307 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
10308 ; GFX8-NEXT: v_add_f32_e32 v13, v17, v13
10309 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
10310 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
10311 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10312 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10313 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10314 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
10315 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
10316 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
10317 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
10318 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
10319 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
10320 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
10321 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
10322 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
10323 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
10324 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10325 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
10326 ; GFX8-NEXT: v_add_f32_e32 v12, v17, v12
10327 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
10328 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
10329 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10330 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10331 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10332 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
10333 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
10334 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
10335 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
10336 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
10337 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
10338 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
10339 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
10340 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
10341 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
10342 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
10343 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
10344 ; GFX8-NEXT: v_add_f32_e32 v11, v17, v11
10345 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
10346 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
10347 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10348 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10349 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10350 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
10351 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
10352 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
10353 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
10354 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
10355 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
10356 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
10357 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
10358 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
10359 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
10360 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
10361 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
10362 ; GFX8-NEXT: v_add_f32_e32 v10, v17, v10
10363 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
10364 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
10365 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10366 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10367 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10368 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
10369 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
10370 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
10371 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
10372 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
10373 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
10374 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
10375 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
10376 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
10377 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
10378 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
10379 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
10380 ; GFX8-NEXT: v_add_f32_e32 v9, v17, v9
10381 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
10382 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
10383 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10384 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10385 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
10386 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
10387 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
10388 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
10389 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
10390 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
10391 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
10392 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
10393 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
10394 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
10395 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
10396 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
10397 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
10398 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
10399 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
10400 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
10401 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
10402 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
10403 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
10404 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
10405 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
10406 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
10407 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
10408 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
10409 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
10410 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
10411 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
10412 ; GFX8-NEXT: s_setpc_b64 s[30:31]
10414 ; GFX9-LABEL: v_fadd_v16bf16:
10416 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10417 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10418 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10419 ; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
10420 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10421 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10422 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
10423 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
10424 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
10425 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
10426 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
10427 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
10428 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
10429 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
10430 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
10431 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
10432 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
10433 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
10434 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
10435 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
10436 ; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
10437 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10438 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10439 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
10440 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
10441 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
10442 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
10443 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
10444 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
10445 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
10446 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
10447 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
10448 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
10449 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
10450 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
10451 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
10452 ; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
10453 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10454 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10455 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
10456 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
10457 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
10458 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
10459 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
10460 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
10461 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
10462 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
10463 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
10464 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
10465 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
10466 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
10467 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
10468 ; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
10469 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10470 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10471 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
10472 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
10473 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
10474 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
10475 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
10476 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
10477 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
10478 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
10479 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
10480 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
10481 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
10482 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10483 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
10484 ; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
10485 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10486 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10487 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
10488 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
10489 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
10490 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
10491 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
10492 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
10493 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
10494 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
10495 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
10496 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
10497 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
10498 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
10499 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
10500 ; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
10501 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10502 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10503 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
10504 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
10505 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
10506 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
10507 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
10508 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
10509 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
10510 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
10511 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
10512 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
10513 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
10514 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
10515 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
10516 ; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
10517 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10518 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10519 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
10520 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
10521 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
10522 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
10523 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
10524 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
10525 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
10526 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
10527 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
10528 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
10529 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
10530 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
10531 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
10532 ; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
10533 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10534 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10535 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
10536 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
10537 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
10538 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
10539 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
10540 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
10541 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
10542 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
10543 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
10544 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
10545 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
10546 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
10547 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
10548 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
10549 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
10550 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
10551 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
10552 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
10553 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
10554 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
10555 ; GFX9-NEXT: s_setpc_b64 s[30:31]
10557 ; GFX10-LABEL: v_fadd_v16bf16:
10559 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10560 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10561 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10562 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10563 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10564 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
10565 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10566 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
10567 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
10568 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
10569 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10570 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
10571 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
10572 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
10573 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
10574 ; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
10575 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
10576 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
10577 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
10578 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
10579 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
10580 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
10581 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
10582 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
10583 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
10584 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
10585 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10586 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10587 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
10588 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
10589 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10590 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10591 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
10592 ; GFX10-NEXT: v_add_f32_e32 v17, v20, v19
10593 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
10594 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
10595 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
10596 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
10597 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
10598 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
10599 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
10600 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10601 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
10602 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10603 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10604 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
10605 ; GFX10-NEXT: v_add_f32_e32 v13, v19, v18
10606 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
10607 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
10608 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10609 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
10610 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
10611 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
10612 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
10613 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10614 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10615 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10616 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
10617 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
10618 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10619 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10620 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
10621 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10622 ; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
10623 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
10624 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
10625 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
10626 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
10627 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
10628 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10629 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
10630 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
10631 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
10632 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
10633 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10634 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
10635 ; GFX10-NEXT: v_add_f32_e32 v18, v19, v18
10636 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10637 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
10638 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
10639 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
10640 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
10641 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
10642 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10643 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10644 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
10645 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
10646 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
10647 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
10648 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10649 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
10650 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
10651 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
10652 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10653 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
10654 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
10655 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
10656 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10657 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
10658 ; GFX10-NEXT: v_add_f32_e32 v19, v22, v20
10659 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
10660 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
10661 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10662 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10663 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
10664 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
10665 ; GFX10-NEXT: v_add_f32_e32 v9, v22, v20
10666 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
10667 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
10668 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
10669 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
10670 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
10671 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
10672 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
10673 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
10674 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
10675 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10676 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
10677 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10678 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
10679 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
10680 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
10681 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
10682 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10683 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
10684 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
10685 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
10686 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10687 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10688 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10689 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
10690 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
10691 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10692 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
10693 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
10694 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10695 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
10696 ; GFX10-NEXT: s_setpc_b64 s[30:31]
10698 ; GFX11-LABEL: v_fadd_v16bf16:
10700 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10701 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
10702 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
10703 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
10704 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
10705 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10706 ; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
10707 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
10708 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10709 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
10710 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10711 ; GFX11-NEXT: v_add_f32_e32 v17, v18, v17
10712 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v14
10713 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
10714 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10715 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
10716 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
10717 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
10718 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10719 ; GFX11-NEXT: v_add_f32_e32 v7, v7, v15
10720 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
10721 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
10722 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
10723 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10724 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
10725 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
10726 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
10727 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
10728 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
10729 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10730 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
10731 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
10732 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
10733 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10734 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
10735 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
10736 ; GFX11-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
10737 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
10738 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
10739 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
10740 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
10741 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
10742 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10743 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
10744 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
10745 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
10746 ; GFX11-NEXT: v_add_f32_e32 v4, v4, v12
10747 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
10748 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
10749 ; GFX11-NEXT: v_add_f32_e32 v5, v5, v13
10750 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
10751 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
10752 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
10753 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
10754 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
10755 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
10756 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
10757 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
10758 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
10759 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
10760 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
10761 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
10762 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
10763 ; GFX11-NEXT: v_add_f32_e32 v12, v18, v12
10764 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10765 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
10766 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
10767 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
10768 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
10769 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10770 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
10771 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
10772 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
10773 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
10774 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
10775 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
10776 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
10777 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
10778 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
10779 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
10780 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
10781 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
10782 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
10783 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
10784 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
10785 ; GFX11-NEXT: v_add_f32_e32 v18, v19, v18
10786 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
10787 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
10788 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
10789 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
10790 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
10791 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
10792 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10793 ; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
10794 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v11
10795 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
10796 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
10797 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
10798 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
10799 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
10800 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
10801 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
10802 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10803 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
10804 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
10805 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
10806 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
10807 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
10808 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
10809 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
10810 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
10811 ; GFX11-NEXT: v_add_f32_e32 v19, v22, v20
10812 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
10813 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
10814 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
10815 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
10816 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
10817 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
10818 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
10819 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
10820 ; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9
10821 ; GFX11-NEXT: v_add_f32_e32 v9, v22, v20
10822 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10823 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
10824 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
10825 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
10826 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
10827 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
10828 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
10829 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
10830 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
10831 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
10832 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
10833 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
10834 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
10835 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10836 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
10837 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
10838 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
10839 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
10840 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
10841 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
10842 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
10843 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
10844 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
10845 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10846 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
10847 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
10848 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
10849 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
10850 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
10851 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
10852 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
10853 ; GFX11-NEXT: s_setpc_b64 s[30:31]
10854 %op = fadd <16 x bfloat> %a, %b
10855 ret <16 x bfloat> %op
10858 define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
10859 ; GCN-LABEL: v_fadd_v32bf16:
10861 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10862 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
10863 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
10864 ; GCN-NEXT: s_waitcnt vmcnt(1)
10865 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
10866 ; GCN-NEXT: s_waitcnt vmcnt(0)
10867 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
10868 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10869 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
10870 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
10871 ; GCN-NEXT: v_add_f32_e32 v31, v31, v32
10872 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
10873 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
10874 ; GCN-NEXT: s_waitcnt vmcnt(0)
10875 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10876 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10877 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
10878 ; GCN-NEXT: v_add_f32_e32 v30, v30, v32
10879 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
10880 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
10881 ; GCN-NEXT: s_waitcnt vmcnt(0)
10882 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10883 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10884 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
10885 ; GCN-NEXT: v_add_f32_e32 v29, v29, v32
10886 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
10887 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
10888 ; GCN-NEXT: s_waitcnt vmcnt(0)
10889 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10890 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10891 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
10892 ; GCN-NEXT: v_add_f32_e32 v28, v28, v32
10893 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
10894 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
10895 ; GCN-NEXT: s_waitcnt vmcnt(0)
10896 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10897 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10898 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
10899 ; GCN-NEXT: v_add_f32_e32 v27, v27, v32
10900 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
10901 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
10902 ; GCN-NEXT: s_waitcnt vmcnt(0)
10903 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10904 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10905 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
10906 ; GCN-NEXT: v_add_f32_e32 v26, v26, v32
10907 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
10908 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
10909 ; GCN-NEXT: s_waitcnt vmcnt(0)
10910 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10911 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10912 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
10913 ; GCN-NEXT: v_add_f32_e32 v25, v25, v32
10914 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
10915 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
10916 ; GCN-NEXT: s_waitcnt vmcnt(0)
10917 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10918 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10919 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
10920 ; GCN-NEXT: v_add_f32_e32 v24, v24, v32
10921 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
10922 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
10923 ; GCN-NEXT: s_waitcnt vmcnt(0)
10924 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10925 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10926 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
10927 ; GCN-NEXT: v_add_f32_e32 v23, v23, v32
10928 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
10929 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
10930 ; GCN-NEXT: s_waitcnt vmcnt(0)
10931 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10932 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10933 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
10934 ; GCN-NEXT: v_add_f32_e32 v22, v22, v32
10935 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
10936 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
10937 ; GCN-NEXT: s_waitcnt vmcnt(0)
10938 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10939 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10940 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
10941 ; GCN-NEXT: v_add_f32_e32 v21, v21, v32
10942 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
10943 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
10944 ; GCN-NEXT: s_waitcnt vmcnt(0)
10945 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10946 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10947 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
10948 ; GCN-NEXT: v_add_f32_e32 v20, v20, v32
10949 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
10950 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
10951 ; GCN-NEXT: s_waitcnt vmcnt(0)
10952 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10953 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10954 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
10955 ; GCN-NEXT: v_add_f32_e32 v19, v19, v32
10956 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
10957 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
10958 ; GCN-NEXT: s_waitcnt vmcnt(0)
10959 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10960 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10961 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
10962 ; GCN-NEXT: v_add_f32_e32 v18, v18, v32
10963 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
10964 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
10965 ; GCN-NEXT: s_waitcnt vmcnt(0)
10966 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10967 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10968 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
10969 ; GCN-NEXT: v_add_f32_e32 v17, v17, v32
10970 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
10971 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
10972 ; GCN-NEXT: s_waitcnt vmcnt(0)
10973 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10974 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10975 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
10976 ; GCN-NEXT: v_add_f32_e32 v16, v16, v32
10977 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
10978 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
10979 ; GCN-NEXT: s_waitcnt vmcnt(0)
10980 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10981 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10982 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
10983 ; GCN-NEXT: v_add_f32_e32 v15, v15, v32
10984 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
10985 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
10986 ; GCN-NEXT: s_waitcnt vmcnt(0)
10987 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10988 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10989 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
10990 ; GCN-NEXT: v_add_f32_e32 v14, v14, v32
10991 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
10992 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
10993 ; GCN-NEXT: s_waitcnt vmcnt(0)
10994 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
10995 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
10996 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
10997 ; GCN-NEXT: v_add_f32_e32 v13, v13, v32
10998 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
10999 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11000 ; GCN-NEXT: s_waitcnt vmcnt(0)
11001 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11002 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11003 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
11004 ; GCN-NEXT: v_add_f32_e32 v12, v12, v32
11005 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
11006 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11007 ; GCN-NEXT: s_waitcnt vmcnt(0)
11008 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11009 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11010 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
11011 ; GCN-NEXT: v_add_f32_e32 v11, v11, v32
11012 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
11013 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11014 ; GCN-NEXT: s_waitcnt vmcnt(0)
11015 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11016 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11017 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
11018 ; GCN-NEXT: v_add_f32_e32 v10, v10, v32
11019 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
11020 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11021 ; GCN-NEXT: s_waitcnt vmcnt(0)
11022 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11023 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11024 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
11025 ; GCN-NEXT: v_add_f32_e32 v9, v9, v32
11026 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
11027 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11028 ; GCN-NEXT: s_waitcnt vmcnt(0)
11029 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11030 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11031 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
11032 ; GCN-NEXT: v_add_f32_e32 v8, v8, v32
11033 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
11034 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11035 ; GCN-NEXT: s_waitcnt vmcnt(0)
11036 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11037 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11038 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
11039 ; GCN-NEXT: v_add_f32_e32 v7, v7, v32
11040 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
11041 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11042 ; GCN-NEXT: s_waitcnt vmcnt(0)
11043 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11044 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11045 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
11046 ; GCN-NEXT: v_add_f32_e32 v6, v6, v32
11047 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
11048 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11049 ; GCN-NEXT: s_waitcnt vmcnt(0)
11050 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11051 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11052 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
11053 ; GCN-NEXT: v_add_f32_e32 v5, v5, v32
11054 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
11055 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11056 ; GCN-NEXT: s_waitcnt vmcnt(0)
11057 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11058 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11059 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
11060 ; GCN-NEXT: v_add_f32_e32 v4, v4, v32
11061 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
11062 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11063 ; GCN-NEXT: s_waitcnt vmcnt(0)
11064 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11065 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11066 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
11067 ; GCN-NEXT: v_add_f32_e32 v3, v3, v32
11068 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
11069 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11070 ; GCN-NEXT: s_waitcnt vmcnt(0)
11071 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11072 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11073 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
11074 ; GCN-NEXT: v_add_f32_e32 v2, v2, v32
11075 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
11076 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11077 ; GCN-NEXT: s_waitcnt vmcnt(0)
11078 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11079 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11080 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
11081 ; GCN-NEXT: v_add_f32_e32 v1, v1, v32
11082 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
11083 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11084 ; GCN-NEXT: s_waitcnt vmcnt(0)
11085 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
11086 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11087 ; GCN-NEXT: v_add_f32_e32 v0, v0, v32
11088 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11089 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11090 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11091 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11092 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11093 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11094 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11095 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11096 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11097 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11098 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11099 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11100 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11101 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11102 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11103 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11104 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11105 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11106 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11107 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11108 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11109 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11110 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11111 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11112 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11113 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11114 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11115 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11116 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11117 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11118 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11119 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11120 ; GCN-NEXT: s_setpc_b64 s[30:31]
11122 ; GFX7-LABEL: v_fadd_v32bf16:
11124 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11125 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
11126 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
11127 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
11128 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11129 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
11130 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11131 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
11132 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11133 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
11134 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11135 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
11136 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11137 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
11138 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11139 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
11140 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11141 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
11142 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11143 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
11144 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11145 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
11146 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11147 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
11148 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11149 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
11150 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11151 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
11152 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11153 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
11154 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11155 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
11156 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11157 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
11158 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11159 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
11160 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11161 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
11162 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11163 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
11164 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11165 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
11166 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11167 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
11168 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11169 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
11170 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11171 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
11172 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11173 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
11174 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11175 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
11176 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11177 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
11178 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11179 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
11180 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11181 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
11182 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11183 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
11184 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11185 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
11186 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11187 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
11188 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11189 ; GFX7-NEXT: s_waitcnt vmcnt(1)
11190 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
11191 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11192 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11193 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11194 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11195 ; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
11196 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
11197 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
11198 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11199 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11200 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11201 ; GFX7-NEXT: v_add_f32_e32 v30, v30, v32
11202 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
11203 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11204 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11205 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11206 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11207 ; GFX7-NEXT: v_add_f32_e32 v29, v29, v32
11208 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
11209 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11210 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11211 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11212 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11213 ; GFX7-NEXT: v_add_f32_e32 v28, v28, v32
11214 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
11215 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11216 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11217 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11218 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11219 ; GFX7-NEXT: v_add_f32_e32 v27, v27, v32
11220 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
11221 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11222 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11223 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11224 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11225 ; GFX7-NEXT: v_add_f32_e32 v26, v26, v32
11226 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
11227 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11228 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11229 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11230 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11231 ; GFX7-NEXT: v_add_f32_e32 v25, v25, v32
11232 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
11233 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11234 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11235 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11236 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11237 ; GFX7-NEXT: v_add_f32_e32 v24, v24, v32
11238 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
11239 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11240 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11241 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11242 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11243 ; GFX7-NEXT: v_add_f32_e32 v23, v23, v32
11244 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
11245 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11246 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11247 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11248 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11249 ; GFX7-NEXT: v_add_f32_e32 v22, v22, v32
11250 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
11251 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11252 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11253 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11254 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11255 ; GFX7-NEXT: v_add_f32_e32 v21, v21, v32
11256 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
11257 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11258 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11259 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11260 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11261 ; GFX7-NEXT: v_add_f32_e32 v20, v20, v32
11262 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
11263 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11264 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11265 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11266 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11267 ; GFX7-NEXT: v_add_f32_e32 v19, v19, v32
11268 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
11269 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11270 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11271 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11272 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11273 ; GFX7-NEXT: v_add_f32_e32 v18, v18, v32
11274 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
11275 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11276 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11277 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11278 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11279 ; GFX7-NEXT: v_add_f32_e32 v17, v17, v32
11280 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
11281 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11282 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11283 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11284 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11285 ; GFX7-NEXT: v_add_f32_e32 v16, v16, v32
11286 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
11287 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11288 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11289 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11290 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11291 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v32
11292 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
11293 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11294 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11295 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11296 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11297 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v32
11298 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
11299 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11300 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11301 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11302 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11303 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v32
11304 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
11305 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11306 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11307 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11308 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11309 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v32
11310 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
11311 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11312 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11313 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11314 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11315 ; GFX7-NEXT: v_add_f32_e32 v11, v11, v32
11316 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
11317 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11318 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11319 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11320 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11321 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v32
11322 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
11323 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11324 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11325 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11326 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11327 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v32
11328 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
11329 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11330 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11331 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11332 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11333 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v32
11334 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
11335 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11336 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11337 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11338 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11339 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v32
11340 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
11341 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11342 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11343 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11344 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11345 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v32
11346 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
11347 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11348 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11349 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11350 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11351 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v32
11352 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
11353 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11354 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11355 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11356 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11357 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v32
11358 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
11359 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11360 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11361 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11362 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11363 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v32
11364 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
11365 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11366 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11367 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11368 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11369 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v32
11370 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
11371 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11372 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11373 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11374 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11375 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v32
11376 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
11377 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11378 ; GFX7-NEXT: s_waitcnt vmcnt(0)
11379 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
11380 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
11381 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v32
11382 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11383 ; GFX7-NEXT: s_setpc_b64 s[30:31]
11385 ; GFX8-LABEL: v_fadd_v32bf16:
11387 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11388 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
11389 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
11390 ; GFX8-NEXT: v_add_f32_e32 v31, v32, v31
11391 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
11392 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
11393 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
11394 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11395 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11396 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
11397 ; GFX8-NEXT: v_add_f32_e32 v14, v14, v30
11398 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
11399 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
11400 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
11401 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
11402 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
11403 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
11404 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
11405 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
11406 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
11407 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
11408 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
11409 ; GFX8-NEXT: v_add_f32_e32 v32, v32, v30
11410 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
11411 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
11412 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11413 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11414 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11415 ; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
11416 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
11417 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
11418 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
11419 ; GFX8-NEXT: s_waitcnt vmcnt(0)
11420 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
11421 ; GFX8-NEXT: v_add_f32_e32 v33, v33, v34
11422 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11423 ; GFX8-NEXT: v_add_f32_e32 v30, v15, v30
11424 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
11425 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
11426 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
11427 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
11428 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
11429 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
11430 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
11431 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
11432 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11433 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
11434 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
11435 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
11436 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
11437 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
11438 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11439 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
11440 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
11441 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
11442 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
11443 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
11444 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
11445 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
11446 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
11447 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
11448 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
11449 ; GFX8-NEXT: v_add_f32_e32 v29, v33, v29
11450 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
11451 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
11452 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11453 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11454 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11455 ; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
11456 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
11457 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
11458 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
11459 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
11460 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
11461 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
11462 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
11463 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
11464 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
11465 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
11466 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
11467 ; GFX8-NEXT: v_add_f32_e32 v28, v33, v28
11468 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
11469 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
11470 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11471 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11472 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11473 ; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
11474 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
11475 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
11476 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
11477 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
11478 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
11479 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
11480 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
11481 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
11482 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
11483 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
11484 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
11485 ; GFX8-NEXT: v_add_f32_e32 v27, v33, v27
11486 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
11487 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
11488 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11489 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11490 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11491 ; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
11492 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
11493 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
11494 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
11495 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
11496 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
11497 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
11498 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
11499 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
11500 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
11501 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
11502 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
11503 ; GFX8-NEXT: v_add_f32_e32 v26, v33, v26
11504 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
11505 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
11506 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11507 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11508 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11509 ; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
11510 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
11511 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
11512 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
11513 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
11514 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
11515 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
11516 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
11517 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
11518 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
11519 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
11520 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
11521 ; GFX8-NEXT: v_add_f32_e32 v25, v33, v25
11522 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
11523 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
11524 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11525 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11526 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11527 ; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
11528 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
11529 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
11530 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
11531 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
11532 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
11533 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
11534 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
11535 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
11536 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
11537 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
11538 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
11539 ; GFX8-NEXT: v_add_f32_e32 v24, v33, v24
11540 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
11541 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
11542 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11543 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11544 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11545 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
11546 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
11547 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
11548 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
11549 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
11550 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
11551 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
11552 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
11553 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
11554 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
11555 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
11556 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
11557 ; GFX8-NEXT: v_add_f32_e32 v23, v33, v23
11558 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
11559 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
11560 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11561 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11562 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11563 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
11564 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
11565 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
11566 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
11567 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
11568 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
11569 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
11570 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
11571 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
11572 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
11573 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
11574 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
11575 ; GFX8-NEXT: v_add_f32_e32 v22, v33, v22
11576 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
11577 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
11578 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11579 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11580 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11581 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
11582 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
11583 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
11584 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
11585 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
11586 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
11587 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
11588 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
11589 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
11590 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
11591 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
11592 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
11593 ; GFX8-NEXT: v_add_f32_e32 v21, v33, v21
11594 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
11595 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
11596 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11597 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11598 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11599 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
11600 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
11601 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
11602 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
11603 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
11604 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
11605 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
11606 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
11607 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
11608 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
11609 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
11610 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
11611 ; GFX8-NEXT: v_add_f32_e32 v20, v33, v20
11612 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
11613 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
11614 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11615 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11616 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11617 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
11618 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
11619 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
11620 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
11621 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
11622 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
11623 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
11624 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
11625 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
11626 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
11627 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
11628 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
11629 ; GFX8-NEXT: v_add_f32_e32 v19, v33, v19
11630 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
11631 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
11632 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11633 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11634 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11635 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
11636 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
11637 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
11638 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
11639 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
11640 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
11641 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
11642 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
11643 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
11644 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
11645 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
11646 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
11647 ; GFX8-NEXT: v_add_f32_e32 v18, v33, v18
11648 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
11649 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
11650 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11651 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11652 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11653 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
11654 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
11655 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
11656 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
11657 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
11658 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
11659 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
11660 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
11661 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
11662 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
11663 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
11664 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
11665 ; GFX8-NEXT: v_add_f32_e32 v17, v33, v17
11666 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
11667 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
11668 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11669 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11670 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
11671 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
11672 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
11673 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
11674 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
11675 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
11676 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
11677 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
11678 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
11679 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
11680 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
11681 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
11682 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
11683 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
11684 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
11685 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
11686 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
11687 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
11688 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
11689 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
11690 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
11691 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
11692 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
11693 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
11694 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
11695 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
11696 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
11697 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
11698 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
11699 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
11700 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
11701 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
11702 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
11703 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
11704 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
11705 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
11706 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
11707 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
11708 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
11709 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
11710 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
11711 ; GFX8-NEXT: s_setpc_b64 s[30:31]
11713 ; GFX9-LABEL: v_fadd_v32bf16:
11715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11716 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
11717 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
11718 ; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
11719 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
11720 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
11721 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
11722 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
11723 ; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
11724 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
11725 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
11726 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
11727 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
11728 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
11729 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
11730 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
11731 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
11732 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
11733 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
11734 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
11735 ; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
11736 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11737 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
11738 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
11739 ; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
11740 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
11741 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
11742 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
11743 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
11744 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
11745 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
11746 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
11747 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
11748 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
11749 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
11750 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
11751 ; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
11752 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
11753 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
11754 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
11755 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
11756 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
11757 ; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
11758 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
11759 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
11760 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11761 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
11762 ; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
11763 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
11764 ; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
11765 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
11766 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
11767 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
11768 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
11769 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
11770 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
11771 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
11772 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
11773 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
11774 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
11775 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
11776 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
11777 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
11778 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
11779 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
11780 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
11781 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
11782 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
11783 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
11784 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
11785 ; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
11786 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
11787 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
11788 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
11789 ; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
11790 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
11791 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
11792 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
11793 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
11794 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
11795 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
11796 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
11797 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
11798 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
11799 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
11800 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
11801 ; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
11802 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
11803 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
11804 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
11805 ; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
11806 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
11807 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
11808 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
11809 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
11810 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
11811 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
11812 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
11813 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
11814 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
11815 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
11816 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
11817 ; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
11818 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
11819 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
11820 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
11821 ; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
11822 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
11823 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
11824 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
11825 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
11826 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
11827 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
11828 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
11829 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
11830 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
11831 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
11832 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
11833 ; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
11834 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
11835 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
11836 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
11837 ; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
11838 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
11839 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
11840 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
11841 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
11842 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
11843 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
11844 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
11845 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
11846 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
11847 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
11848 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
11849 ; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
11850 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
11851 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
11852 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
11853 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
11854 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
11855 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
11856 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
11857 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
11858 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
11859 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
11860 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
11861 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
11862 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
11863 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
11864 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
11865 ; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
11866 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
11867 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
11868 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
11869 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
11870 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
11871 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
11872 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
11873 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
11874 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
11875 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
11876 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
11877 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
11878 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
11879 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
11880 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
11881 ; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
11882 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
11883 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
11884 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
11885 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
11886 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
11887 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
11888 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
11889 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
11890 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
11891 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
11892 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
11893 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
11894 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
11895 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
11896 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
11897 ; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
11898 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
11899 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
11900 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
11901 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
11902 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
11903 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
11904 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
11905 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
11906 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
11907 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
11908 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
11909 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
11910 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
11911 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
11912 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
11913 ; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
11914 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
11915 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
11916 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
11917 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
11918 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
11919 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
11920 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
11921 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
11922 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
11923 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
11924 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
11925 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
11926 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
11927 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
11928 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
11929 ; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
11930 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
11931 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
11932 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
11933 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
11934 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
11935 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
11936 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
11937 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
11938 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
11939 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
11940 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
11941 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
11942 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
11943 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
11944 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
11945 ; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
11946 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
11947 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
11948 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
11949 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
11950 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
11951 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
11952 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
11953 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
11954 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
11955 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
11956 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
11957 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
11958 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
11959 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
11960 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
11961 ; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
11962 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
11963 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
11964 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
11965 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
11966 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
11967 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
11968 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
11969 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
11970 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
11971 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
11972 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
11973 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
11974 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
11975 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
11976 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
11977 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
11978 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
11979 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
11980 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
11981 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
11982 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
11983 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
11984 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
11985 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
11986 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
11987 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
11988 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
11989 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
11990 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
11991 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
11992 ; GFX9-NEXT: s_setpc_b64 s[30:31]
11994 ; GFX10-LABEL: v_fadd_v32bf16:
11996 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11997 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
11998 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
11999 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
12000 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
12001 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
12002 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
12003 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
12004 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
12005 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
12006 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
12007 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
12008 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
12009 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
12010 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
12011 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
12012 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
12013 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
12014 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
12015 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
12016 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
12017 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
12018 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
12019 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
12020 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
12021 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
12022 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
12023 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
12024 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
12025 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
12026 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
12027 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
12028 ; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
12029 ; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
12030 ; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
12031 ; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
12032 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
12033 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
12034 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
12035 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
12036 ; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
12037 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
12038 ; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
12039 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
12040 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
12041 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12042 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
12043 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
12044 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
12045 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12046 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
12047 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
12048 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
12049 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12050 ; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
12051 ; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
12052 ; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
12053 ; GFX10-NEXT: v_add_f32_e32 v24, v64, v55
12054 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
12055 ; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
12056 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
12057 ; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
12058 ; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
12059 ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
12060 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
12061 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
12062 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
12063 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
12064 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
12065 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
12066 ; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
12067 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
12068 ; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
12069 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
12070 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
12071 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12072 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
12073 ; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
12074 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
12075 ; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
12076 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
12077 ; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
12078 ; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
12079 ; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
12080 ; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
12081 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
12082 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
12083 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
12084 ; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
12085 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
12086 ; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
12087 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
12088 ; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
12089 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
12090 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12091 ; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
12092 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
12093 ; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
12094 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
12095 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
12096 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12097 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
12098 ; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
12099 ; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
12100 ; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
12101 ; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
12102 ; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
12103 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
12104 ; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
12105 ; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
12106 ; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
12107 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
12108 ; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
12109 ; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
12110 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
12111 ; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
12112 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
12113 ; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
12114 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
12115 ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
12116 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
12117 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
12118 ; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
12119 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
12120 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
12121 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
12122 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
12123 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
12124 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
12125 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
12126 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
12127 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
12128 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
12129 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
12130 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
12131 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
12132 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
12133 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
12134 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
12135 ; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
12136 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
12137 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
12138 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
12139 ; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
12140 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
12141 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
12142 ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
12143 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
12144 ; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
12145 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
12146 ; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
12147 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
12148 ; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
12149 ; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
12150 ; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
12151 ; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
12152 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
12153 ; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
12154 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
12155 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
12156 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
12157 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
12158 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
12159 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
12160 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
12161 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
12162 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
12163 ; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
12164 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
12165 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
12166 ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
12167 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
12168 ; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
12169 ; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
12170 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
12171 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
12172 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
12173 ; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
12174 ; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
12175 ; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
12176 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
12177 ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
12178 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
12179 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
12180 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
12181 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
12182 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
12183 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
12184 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
12185 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
12186 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
12187 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
12188 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
12189 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
12190 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
12191 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
12192 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
12193 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
12194 ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
12195 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
12196 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
12197 ; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
12198 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
12199 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
12200 ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
12201 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
12202 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
12203 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
12204 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
12205 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
12206 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
12207 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
12208 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
12209 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
12210 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
12211 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
12212 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
12213 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
12214 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
12215 ; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
12216 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
12217 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
12218 ; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
12219 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
12220 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
12221 ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
12222 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
12223 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
12224 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
12225 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
12226 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
12227 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
12228 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
12229 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
12230 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
12231 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
12232 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
12233 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
12234 ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
12235 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
12236 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
12237 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
12238 ; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
12239 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
12240 ; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
12241 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
12242 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
12243 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
12244 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
12245 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
12246 ; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
12247 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
12248 ; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
12249 ; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
12250 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
12251 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
12252 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
12253 ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
12254 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
12255 ; GFX10-NEXT: s_waitcnt vmcnt(0)
12256 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
12257 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
12258 ; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
12259 ; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
12260 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
12261 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
12262 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
12263 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
12264 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12265 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
12266 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
12267 ; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
12268 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
12269 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
12270 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
12271 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12273 ; GFX11-LABEL: v_fadd_v32bf16:
12275 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12276 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
12277 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
12278 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
12279 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
12280 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12281 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
12282 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
12283 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
12284 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12285 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
12286 ; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
12287 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
12288 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12289 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
12290 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
12291 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
12292 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
12293 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
12294 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
12295 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
12296 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
12297 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
12298 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
12299 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12300 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
12301 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
12302 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
12303 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12304 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
12305 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
12306 ; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
12307 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
12308 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
12309 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
12310 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12311 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
12312 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
12313 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
12314 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
12315 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
12316 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
12317 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
12318 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
12319 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12320 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
12321 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12322 ; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
12323 ; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
12324 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
12325 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
12326 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12327 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
12328 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
12329 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
12330 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
12331 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
12332 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
12333 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
12334 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
12335 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
12336 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
12337 ; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
12338 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v16
12339 ; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
12340 ; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
12341 ; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
12342 ; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
12343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
12344 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
12345 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
12346 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
12347 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
12348 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
12349 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
12350 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
12351 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
12352 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
12353 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12354 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
12355 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
12356 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
12357 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
12358 ; GFX11-NEXT: v_add_f32_e32 v4, v4, v20
12359 ; GFX11-NEXT: v_add_f32_e32 v20, v80, v71
12360 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
12361 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
12362 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
12363 ; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
12364 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12365 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
12366 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
12367 ; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
12368 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
12369 ; GFX11-NEXT: v_add_f32_e32 v26, v52, v51
12370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12371 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
12372 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
12373 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
12374 ; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
12375 ; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
12376 ; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
12377 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
12378 ; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
12379 ; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
12380 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12381 ; GFX11-NEXT: v_add_f32_e32 v29, v38, v37
12382 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
12383 ; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
12384 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
12385 ; GFX11-NEXT: v_add_f32_e32 v14, v14, v30
12386 ; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
12387 ; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
12388 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
12389 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
12390 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
12391 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
12392 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
12393 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
12394 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
12395 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
12396 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
12397 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
12398 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
12399 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
12400 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
12401 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
12402 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
12403 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
12404 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
12405 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
12406 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
12407 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
12408 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
12409 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
12410 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
12411 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
12412 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
12413 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
12414 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
12415 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
12416 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
12417 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
12418 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
12419 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
12420 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
12421 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
12422 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
12423 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
12424 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
12425 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
12426 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
12427 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
12428 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
12429 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
12430 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
12431 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
12432 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
12433 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
12434 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
12435 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
12436 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
12437 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
12438 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
12439 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
12440 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
12441 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
12442 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
12443 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
12444 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
12445 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
12446 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
12447 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
12448 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
12449 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
12450 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
12451 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
12452 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
12453 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
12454 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
12455 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
12456 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
12457 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
12458 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
12459 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
12460 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
12461 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
12462 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
12463 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
12464 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
12465 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
12466 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
12467 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
12468 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
12469 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
12470 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
12471 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
12472 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
12473 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
12474 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
12475 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
12476 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
12477 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
12478 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
12479 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
12480 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
12481 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
12482 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
12483 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
12484 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
12485 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
12486 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
12487 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
12488 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
12489 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
12490 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
12491 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
12492 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12493 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
12494 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
12495 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
12496 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
12497 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
12498 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
12499 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
12500 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
12501 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
12502 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
12503 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12504 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
12505 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
12506 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
12507 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
12508 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
12509 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
12510 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
12511 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
12512 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
12513 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
12514 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
12515 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
12516 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
12517 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
12519 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
12520 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
12521 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12522 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
12523 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12524 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
12525 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
12526 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
12527 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12528 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
12529 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
12530 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
12531 ; GFX11-NEXT: s_waitcnt vmcnt(0)
12532 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
12533 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12534 ; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
12535 ; GFX11-NEXT: v_add_f32_e32 v15, v15, v18
12536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12537 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
12538 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
12539 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
12540 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
12541 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
12542 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
12543 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
12544 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12545 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
12546 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
12547 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
12548 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12549 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
12550 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12551 %op = fadd <32 x bfloat> %a, %b
12552 ret <32 x bfloat> %op
; Scalar bfloat fadd with the floating-point immediate 1.0.
; In every check prefix below the constant shows up directly as the `1.0`
; operand of v_add_f32.
; GCN/GFX7 mask the value with 0xffff0000 before and after the add. GFX8 and
; later shift the input left by 16, add, then run the bfe / +0x7fff /
; or 0x400000 / v_cmp_u / v_cndmask sequence and shift right by 16
; (presumably round-to-nearest-even with NaN quieting -- confirm against the
; lowering).
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
12555 define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
12556 ; GCN-LABEL: v_fadd_bf16_fpimm_0:
12558 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12559 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12560 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12561 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
12562 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12563 ; GCN-NEXT: s_setpc_b64 s[30:31]
12565 ; GFX7-LABEL: v_fadd_bf16_fpimm_0:
12567 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12568 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12569 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12570 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
12571 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12572 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12574 ; GFX8-LABEL: v_fadd_bf16_fpimm_0:
12576 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12577 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12578 ; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
12579 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12580 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12581 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12582 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12583 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12584 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12585 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12586 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12588 ; GFX9-LABEL: v_fadd_bf16_fpimm_0:
12590 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12591 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12592 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
12593 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12594 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12595 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12596 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12597 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12598 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12599 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12600 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12602 ; GFX10-LABEL: v_fadd_bf16_fpimm_0:
12604 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12605 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12606 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
12607 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12608 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12609 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12610 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12611 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12612 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12613 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12615 ; GFX11-LABEL: v_fadd_bf16_fpimm_0:
12617 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12618 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12619 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12620 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
12621 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12622 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12623 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12624 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12625 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12626 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12627 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12628 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12629 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12630 %add = fadd bfloat %arg0, 1.0
; Scalar bfloat fadd with the floating-point immediate 42.0.
; In every check prefix below the constant is materialized as the 32-bit
; literal 0x42280000 (the f32 bit pattern of 42.0) on v_add_f32.
; GCN/GFX7 mask the value with 0xffff0000 before and after the add. GFX8 and
; later shift the input left by 16, add, then run the bfe / +0x7fff /
; or 0x400000 / v_cmp_u / v_cndmask sequence and shift right by 16
; (presumably round-to-nearest-even with NaN quieting -- confirm against the
; lowering).
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
12634 define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
12635 ; GCN-LABEL: v_fadd_bf16_fpimm_1:
12637 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12638 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12639 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12640 ; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12641 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12642 ; GCN-NEXT: s_setpc_b64 s[30:31]
12644 ; GFX7-LABEL: v_fadd_bf16_fpimm_1:
12646 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12647 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12648 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12649 ; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12650 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12651 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12653 ; GFX8-LABEL: v_fadd_bf16_fpimm_1:
12655 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12656 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12657 ; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12658 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12659 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12660 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12661 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12662 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12663 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12664 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12665 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12667 ; GFX9-LABEL: v_fadd_bf16_fpimm_1:
12669 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12670 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12671 ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12672 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12673 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12674 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12675 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12676 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12677 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12678 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12679 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12681 ; GFX10-LABEL: v_fadd_bf16_fpimm_1:
12683 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12684 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12685 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12686 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12687 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12688 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12689 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12690 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12691 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12692 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12694 ; GFX11-LABEL: v_fadd_bf16_fpimm_1:
12696 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12697 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12698 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12699 ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
12700 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12701 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12702 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12703 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12704 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12705 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12706 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12707 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12708 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12709 %add = fadd bfloat %arg0, 42.0
; Scalar bfloat fsub of two VGPR arguments (%a in v0, %b in v1 per the checks).
; Every prefix performs the subtraction with v_sub_f32 on 32-bit values:
; GCN/GFX7 mask both operands and the result with 0xffff0000, while GFX8 and
; later shift each operand left by 16, subtract, then run the bfe / +0x7fff /
; or 0x400000 / v_cmp_u / v_cndmask sequence and shift right by 16
; (presumably round-to-nearest-even with NaN quieting -- confirm against the
; lowering).
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
12713 define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
12714 ; GCN-LABEL: v_fsub_bf16:
12716 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12717 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12718 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12719 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12720 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12721 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
12722 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12723 ; GCN-NEXT: s_setpc_b64 s[30:31]
12725 ; GFX7-LABEL: v_fsub_bf16:
12727 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12728 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12729 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12730 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12731 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12732 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
12733 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12734 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12736 ; GFX8-LABEL: v_fsub_bf16:
12738 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12739 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12740 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12741 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
12742 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12743 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12744 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12745 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
12746 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12747 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12748 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12749 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12751 ; GFX9-LABEL: v_fsub_bf16:
12753 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12754 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12755 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12756 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
12757 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12758 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12759 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12760 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
12761 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12762 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
12763 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12764 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12766 ; GFX10-LABEL: v_fsub_bf16:
12768 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12769 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12770 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12771 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
12772 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
12773 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
12774 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12775 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12776 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12777 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12778 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12780 ; GFX11-LABEL: v_fsub_bf16:
12782 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12783 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12784 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
12785 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12786 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
12787 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
12788 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
12789 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12790 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12791 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
12792 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
12793 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12794 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12795 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12796 %op = fsub bfloat %a, %b
; Element-wise fsub of two <2 x bfloat> vectors.
; GCN/GFX7 checks keep each element in its own VGPR (v0/v1 minus v2/v3) and
; mask operands and results with 0xffff0000.
; GFX8 and later checks keep each vector packed in one VGPR (v0 minus v1):
; the low element is exposed with a left shift by 16, the high element with an
; and of 0xffff0000, and each half goes through v_sub_f32 followed by the
; bfe / +0x7fff / or 0x400000 / v_cmp_u / v_cndmask sequence (presumably
; round-to-nearest-even with NaN quieting -- confirm against the lowering).
; The two results are repacked with v_alignbit_b32 on GFX8 and with
; v_perm_b32 selector 0x7060302 on GFX9/GFX10/GFX11.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
12800 define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
12801 ; GCN-LABEL: v_fsub_v2bf16:
12803 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12804 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12805 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
12806 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12807 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
12808 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12809 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12810 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12811 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12812 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v3
12813 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2
12814 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12815 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12816 ; GCN-NEXT: s_setpc_b64 s[30:31]
12818 ; GFX7-LABEL: v_fsub_v2bf16:
12820 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12821 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12822 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
12823 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12824 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
12825 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12826 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12827 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12828 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12829 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
12830 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2
12831 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12832 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12833 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12835 ; GFX8-LABEL: v_fsub_v2bf16:
12837 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12838 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12839 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12840 ; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
12841 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
12842 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
12843 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12844 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12845 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
12846 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
12847 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
12848 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
12849 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
12850 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
12851 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
12852 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
12853 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
12854 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12855 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
12856 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
12857 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
12858 ; GFX8-NEXT: s_setpc_b64 s[30:31]
12860 ; GFX9-LABEL: v_fsub_v2bf16:
12862 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12863 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12864 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12865 ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
12866 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12867 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12868 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
12869 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
12870 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
12871 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
12872 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
12873 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
12874 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
12875 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
12876 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
12877 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
12878 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
12879 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
12880 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
12881 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
12882 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12884 ; GFX10-LABEL: v_fsub_v2bf16:
12886 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12887 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12888 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12889 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12890 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12891 ; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
12892 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
12893 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
12894 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
12895 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
12896 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12897 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
12898 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
12899 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
12900 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12901 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12902 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12903 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
12904 ; GFX10-NEXT: s_setpc_b64 s[30:31]
12906 ; GFX11-LABEL: v_fsub_v2bf16:
12908 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12909 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
12910 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12911 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
12912 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12913 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
12914 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
12915 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2
12916 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12917 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
12918 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
12919 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
12920 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
12921 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
12922 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
12923 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
12924 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
12925 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
12926 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
12927 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
12928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
12929 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
12930 ; GFX11-NEXT: s_setpc_b64 s[30:31]
12931 %op = fsub <2 x bfloat> %a, %b
12932 ret <2 x bfloat> %op
; Element-wise fsub of two <3 x bfloat> vectors.
; GCN/GFX7 checks keep each element in its own VGPR (v0..v2 minus v3..v5) and
; mask operands and results with 0xffff0000.
; GFX8 and later checks use two VGPRs per vector (v0:v1 minus v2:v3): elements
; 0 and 1 are packed in v0/v2 and handled like the <2 x bfloat> case, while
; element 2 is the value shifted up from v1/v3. Each f32 result goes through
; the bfe / +0x7fff / or 0x400000 / v_cmp_u / v_cndmask sequence (presumably
; round-to-nearest-even with NaN quieting -- confirm against the lowering).
; The final v1 is produced by v_lshrrev_b32 on GFX8 and by v_alignbit_b32 on
; GFX9 and later; the alignbit's first source (s4, v0 or s0) is presumably a
; don't-care filler for the unused fourth lane -- TODO confirm.
; GFX11 is checked under the separate GFX11TRUE16 and GFX11FAKE16 prefixes
; here; the two check bodies below differ only in that first source operand of
; the final v_alignbit_b32 (v0 vs. s0).
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
12935 define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
12936 ; GCN-LABEL: v_fsub_v3bf16:
12938 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12939 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
12940 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
12941 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
12942 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
12943 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
12944 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
12945 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12946 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12947 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12948 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12949 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12950 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12951 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
12952 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
12953 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v3
12954 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12955 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12956 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12957 ; GCN-NEXT: s_setpc_b64 s[30:31]
12959 ; GFX7-LABEL: v_fsub_v3bf16:
12961 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12962 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
12963 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
12964 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
12965 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
12966 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
12967 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
12968 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
12969 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12970 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
12971 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12972 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
12973 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12974 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
12975 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4
12976 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3
12977 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
12978 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
12979 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
12980 ; GFX7-NEXT: s_setpc_b64 s[30:31]
12982 ; GFX8-LABEL: v_fsub_v3bf16:
12984 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12985 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
12986 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
12987 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
12988 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
12989 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
12990 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
12991 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
12992 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
12993 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
12994 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
12995 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
12996 ; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
12997 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
12998 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
12999 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
13000 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13001 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13002 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
13003 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
13004 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
13005 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13006 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13007 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13008 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13009 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13010 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
13011 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13012 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13013 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13014 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13015 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13016 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13018 ; GFX9-LABEL: v_fsub_v3bf16:
13020 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13021 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13022 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13023 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
13024 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13025 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13026 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13027 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
13028 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13029 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13030 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13031 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13032 ; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
13033 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13034 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13035 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
13036 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
13037 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
13038 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
13039 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13040 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13041 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13042 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13043 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
13044 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13045 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13046 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13047 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13048 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
13049 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13051 ; GFX10-LABEL: v_fsub_v3bf16:
13053 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13054 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13055 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13056 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13057 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13058 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13059 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13060 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
13061 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
13062 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
13063 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
13064 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13065 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
13066 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13067 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
13068 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13069 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13070 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13071 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
13072 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13073 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13074 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13075 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13076 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13077 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13078 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13079 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
13080 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13082 ; GFX11TRUE16-LABEL: v_fsub_v3bf16:
13083 ; GFX11TRUE16: ; %bb.0:
13084 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13085 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13086 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13087 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13088 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13089 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13090 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13091 ; GFX11TRUE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13092 ; GFX11TRUE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13093 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13094 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13095 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13096 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13097 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13098 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13099 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13100 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13101 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13102 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13103 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13104 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13105 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13106 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13107 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13108 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13109 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13110 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13111 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13112 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
13113 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
13115 ; GFX11FAKE16-LABEL: v_fsub_v3bf16:
13116 ; GFX11FAKE16: ; %bb.0:
13117 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13118 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13119 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13120 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13121 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13122 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13123 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13124 ; GFX11FAKE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13125 ; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
13126 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13127 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13128 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13129 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13130 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13131 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13132 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13133 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13134 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13135 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13136 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13137 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13138 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13139 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13140 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13141 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13142 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13143 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13144 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13145 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
13146 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
13147 %op = fsub <3 x bfloat> %a, %b
13148 ret <3 x bfloat> %op
; Codegen test for an element-wise fsub of two <4 x bfloat> arguments, with
; the <4 x bfloat> result returned by value.
; What the checks below show: the GCN and GFX7 runs mask every lane with
; 0xffff0000 and issue four v_sub_f32, while the GFX8 and later runs widen each
; packed half to f32 (shift left by 16, or mask with 0xffff0000), subtract, and
; then run a bfe / add 0x7fff / v_cmp_u / v_cndmask sequence per lane before
; repacking (v_alignbit_b32 on GFX8, v_perm_b32 on GFX9 and later).
; NOTE(review): the bfe + 0x7fff step looks like round-to-nearest-even and the
; OR with 0x400000 like NaN quieting - confirm against the bf16 lowering.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
13151 define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13152 ; GCN-LABEL: v_fsub_v4bf16:
13154 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13155 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13156 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13157 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13158 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13159 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13160 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
13161 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13162 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
13163 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13164 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13165 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13166 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13167 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13168 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13169 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13170 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13171 ; GCN-NEXT: v_sub_f32_e32 v3, v3, v7
13172 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v6
13173 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v5
13174 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v4
13175 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13176 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13177 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13178 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13179 ; GCN-NEXT: s_setpc_b64 s[30:31]
13181 ; GFX7-LABEL: v_fsub_v4bf16:
13183 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13184 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13185 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13186 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13187 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13188 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13189 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
13190 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13191 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
13192 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13193 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13194 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13195 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13196 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13197 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13198 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13199 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13200 ; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7
13201 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
13202 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5
13203 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4
13204 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13205 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13206 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13207 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13208 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13210 ; GFX8-LABEL: v_fsub_v4bf16:
13212 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13213 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13214 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13215 ; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
13216 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
13217 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
13218 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13219 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13220 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
13221 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
13222 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
13223 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13224 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13225 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13226 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13227 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13228 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
13229 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
13230 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13231 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13232 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13233 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13234 ; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
13235 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
13236 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
13237 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13238 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13239 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
13240 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
13241 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
13242 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13243 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13244 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13245 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13246 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13247 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
13248 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13249 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13250 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13251 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13252 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13253 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
13254 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13256 ; GFX9-LABEL: v_fsub_v4bf16:
13258 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13259 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13260 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13261 ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
13262 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13263 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13264 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
13265 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13266 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
13267 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
13268 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
13269 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13270 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13271 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13272 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13273 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
13274 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13275 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13276 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13277 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13278 ; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
13279 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13280 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13281 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
13282 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
13283 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
13284 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
13285 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13286 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13287 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13288 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13289 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
13290 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13291 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13292 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13293 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13294 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
13295 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13297 ; GFX10-LABEL: v_fsub_v4bf16:
13299 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13300 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13301 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13302 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13303 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13304 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13305 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13306 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
13307 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13308 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13309 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
13310 ; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
13311 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
13312 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13313 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
13314 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13315 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
13316 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13317 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
13318 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
13319 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
13320 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13321 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13322 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
13323 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13324 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13325 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13326 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13327 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13328 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13329 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13330 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13331 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
13332 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13333 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
13334 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13336 ; GFX11-LABEL: v_fsub_v4bf16:
13338 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13339 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13340 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13341 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13342 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13343 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13344 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13345 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13346 ; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
13347 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13348 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
13349 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13350 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
13351 ; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
13352 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
13353 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
13354 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
13355 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
13356 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
13357 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13358 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13359 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13360 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13361 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
13362 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
13363 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13364 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
13365 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13366 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13367 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
13368 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13369 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13371 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13372 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13373 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
13374 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
13375 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13376 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
13377 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13378 %op = fsub <4 x bfloat> %a, %b
13379 ret <4 x bfloat> %op
; Codegen test for a scalar fmul of two bfloat arguments.
; What the checks below show: the GCN and GFX7 runs multiply each input by 1.0,
; mask with 0xffff0000, do one v_mul_f32 and mask the result again; the GFX8
; and later runs shift both inputs left by 16, do one v_mul_f32, then apply a
; bfe / add 0x7fff / v_cmp_u / v_cndmask sequence and shift the result right
; by 16.
; NOTE(review): the bfe + 0x7fff step looks like round-to-nearest-even and the
; OR with 0x400000 like NaN quieting - confirm against the bf16 lowering.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
13382 define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
13383 ; GCN-LABEL: v_fmul_bf16:
13385 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13386 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13387 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13388 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13389 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13390 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
13391 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13392 ; GCN-NEXT: s_setpc_b64 s[30:31]
13394 ; GFX7-LABEL: v_fmul_bf16:
13396 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13397 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13398 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13399 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13401 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
13402 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13403 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13405 ; GFX8-LABEL: v_fmul_bf16:
13407 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13408 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13409 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13410 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
13411 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
13412 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
13413 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
13414 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
13415 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13416 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
13417 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13418 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13420 ; GFX9-LABEL: v_fmul_bf16:
13422 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13423 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13424 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13425 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
13426 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
13427 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13428 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
13429 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
13430 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13431 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
13432 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13433 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13435 ; GFX10-LABEL: v_fmul_bf16:
13437 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13438 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13439 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13440 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
13441 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
13442 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
13443 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13444 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
13445 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13446 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13447 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13449 ; GFX11-LABEL: v_fmul_bf16:
13451 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13452 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13453 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
13454 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13455 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
13456 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
13457 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
13458 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13460 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
13461 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
13462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13463 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13464 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13465 %op = fmul bfloat %a, %b
; Codegen test for an element-wise fmul of two <2 x bfloat> arguments, with
; the <2 x bfloat> result returned by value.
; What the checks below show: the GCN and GFX7 runs mask every lane with
; 0xffff0000 and issue two v_mul_f32; the GFX8 and later runs take the low half
; with a shift left by 16 and the high half with a 0xffff0000 mask, multiply,
; apply a bfe / add 0x7fff / v_cmp_u / v_cndmask sequence per lane, and repack
; into v0 (v_alignbit_b32 on GFX8, v_perm_b32 with 0x7060302 on GFX9 and later).
; NOTE(review): the bfe + 0x7fff step looks like round-to-nearest-even and the
; OR with 0x400000 like NaN quieting - confirm against the bf16 lowering.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
13469 define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
13470 ; GCN-LABEL: v_fmul_v2bf16:
13472 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13473 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13474 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13475 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13476 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13477 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13478 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13479 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13480 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13481 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
13482 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
13483 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13484 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13485 ; GCN-NEXT: s_setpc_b64 s[30:31]
13487 ; GFX7-LABEL: v_fmul_v2bf16:
13489 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13490 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13491 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13492 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13493 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13494 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13495 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13496 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13497 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13498 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
13499 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
13500 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13501 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13502 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13504 ; GFX8-LABEL: v_fmul_v2bf16:
13506 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13507 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13508 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13509 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
13510 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
13511 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
13512 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13513 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13514 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
13515 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
13516 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
13517 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
13518 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
13519 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
13520 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
13521 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
13522 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
13523 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13524 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
13525 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13526 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
13527 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13529 ; GFX9-LABEL: v_fmul_v2bf16:
13531 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13532 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13533 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13534 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
13535 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13536 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13537 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
13538 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13539 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
13540 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
13541 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
13542 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
13543 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
13544 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
13545 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
13546 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
13547 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13548 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
13549 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13550 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
13551 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13553 ; GFX10-LABEL: v_fmul_v2bf16:
13555 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13556 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13557 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13558 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13559 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13560 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
13561 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
13562 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
13563 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
13564 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
13565 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
13566 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
13567 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
13568 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
13569 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13570 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13571 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13572 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
13573 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13575 ; GFX11-LABEL: v_fmul_v2bf16:
13577 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13578 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
13579 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13580 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
13581 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13582 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
13583 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
13584 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
13585 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13586 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
13587 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
13588 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
13589 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
13590 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
13591 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
13592 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
13593 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
13594 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
13595 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13596 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
13597 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
13598 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
13599 ; GFX11-NEXT: s_setpc_b64 s[30:31]
13600 %op = fmul <2 x bfloat> %a, %b
13601 ret <2 x bfloat> %op
; Codegen test for an element-wise fmul of two <3 x bfloat> arguments, with
; the <3 x bfloat> result returned by value.
; What the checks below show: the GCN and GFX7 runs mask every lane with
; 0xffff0000 and issue three v_mul_f32; the GFX8 and later runs widen each
; element to f32, multiply, and apply a bfe / add 0x7fff / v_cmp_u / v_cndmask
; sequence per lane. Lanes 0 and 1 are repacked into v0, and the third lane is
; shifted down into v1 (v_lshrrev_b32 on GFX8, v_alignbit_b32 on GFX9 and
; later).
; Unlike the neighbouring tests, the gfx1100 output is checked under the two
; separate GFX11TRUE16 and GFX11FAKE16 prefixes; in the visible checks they
; differ only in the second operand of the final v_alignbit_b32 (v0 versus s0).
; NOTE(review): the bfe + 0x7fff step looks like round-to-nearest-even and the
; OR with 0x400000 like NaN quieting - confirm against the bf16 lowering.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
13604 define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
13605 ; GCN-LABEL: v_fmul_v3bf16:
13607 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13608 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13609 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13610 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13611 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13612 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13613 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13614 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13615 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13616 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13617 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13618 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13619 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13620 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
13621 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
13622 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
13623 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13624 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13625 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13626 ; GCN-NEXT: s_setpc_b64 s[30:31]
13628 ; GFX7-LABEL: v_fmul_v3bf16:
13630 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13631 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13632 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13633 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13634 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13635 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13636 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13637 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13638 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13639 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13640 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13641 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13642 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13643 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
13644 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
13645 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
13646 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13647 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13648 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13649 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13651 ; GFX8-LABEL: v_fmul_v3bf16:
13653 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13654 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13655 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13656 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
13657 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13658 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13659 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
13660 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
13661 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13662 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13663 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13664 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13665 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
13666 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
13667 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13668 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
13669 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13670 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13671 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
13672 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
13673 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
13674 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13675 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13676 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13677 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13678 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13679 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
13680 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13681 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13682 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13683 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13684 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13685 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13687 ; GFX9-LABEL: v_fmul_v3bf16:
13689 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13690 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13691 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13692 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
13693 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13694 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13695 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13696 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
13697 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13698 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
13699 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13700 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
13701 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
13702 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13703 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13704 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
13705 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
13706 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
13707 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
13708 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13709 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13710 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
13711 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13712 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
13713 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13714 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
13715 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13716 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13717 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
13718 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13720 ; GFX10-LABEL: v_fmul_v3bf16:
13722 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13723 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13724 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13725 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13726 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13727 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13728 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
13729 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
13730 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
13731 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
13732 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
13733 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13734 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
13735 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13736 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
13737 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13738 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13739 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13740 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
13741 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13742 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13743 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13744 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13745 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13746 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13747 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13748 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
13749 ; GFX10-NEXT: s_setpc_b64 s[30:31]
13751 ; GFX11TRUE16-LABEL: v_fmul_v3bf16:
13752 ; GFX11TRUE16: ; %bb.0:
13753 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13754 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13755 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13756 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13757 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13758 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13759 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13760 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13761 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13762 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13763 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13764 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13765 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13766 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13767 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13768 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13769 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13770 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13771 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13772 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13773 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13774 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13775 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13776 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13777 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13778 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13779 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13780 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13781 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
13782 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
13784 ; GFX11FAKE16-LABEL: v_fmul_v3bf16:
13785 ; GFX11FAKE16: ; %bb.0:
13786 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13787 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
13788 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13789 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
13790 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13791 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13792 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13793 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
13794 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
13795 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
13796 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
13797 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
13798 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13799 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
13800 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
13801 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
13802 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
13803 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
13804 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
13805 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
13806 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
13807 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13808 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
13809 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
13810 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13811 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
13812 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
13813 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
13814 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
13815 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
13816 %op = fmul <3 x bfloat> %a, %b
13817 ret <3 x bfloat> %op
; Element-wise fmul of two <4 x bfloat> vectors, returning <4 x bfloat>.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the recorded output shows, per check prefix (prefixes map to the RUN
; lines at the top of the file):
;  - GCN (no -mcpu) and GFX7 (hawaii) use one VGPR per element. Every input
;    is multiplied by 1.0 and masked with 0xffff0000, v0-v3 are multiplied by
;    v4-v7 with v_mul_f32, and each product is masked with 0xffff0000 again.
;  - GFX8 (tonga) and newer hold two elements per VGPR (v0-v1 times v2-v3).
;    The low element is widened with a 16-bit left shift and the high element
;    with a 0xffff0000 mask before v_mul_f32. Each f32 product then gets bit 16
;    of itself plus 0x7fff added, or is replaced by (product | 0x400000) when
;    v_cmp_u_f32 reports it unordered with itself.
;  - The narrowed halves are repacked with v_alignbit_b32 on GFX8 and with
;    v_perm_b32 (selector 0x7060302) on GFX9, GFX10 and GFX11.
;  - The GFX11 prefix is shared by both gfx1100 RUN lines (+real-true16 and
;    -real-true16).
13820 define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
13821 ; GCN-LABEL: v_fmul_v4bf16:
13823 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13824 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
13825 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
13826 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
13827 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
13828 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
13829 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
13830 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
13831 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
13832 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13833 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13834 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13835 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13836 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13837 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13838 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13839 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13840 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
13841 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
13842 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
13843 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
13844 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13845 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13846 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13847 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13848 ; GCN-NEXT: s_setpc_b64 s[30:31]
13850 ; GFX7-LABEL: v_fmul_v4bf16:
13852 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13853 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
13854 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
13855 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
13856 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
13857 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
13858 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
13859 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
13860 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
13861 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
13862 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13863 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
13864 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13865 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
13866 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13867 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
13868 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13869 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
13870 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
13871 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
13872 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
13873 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13874 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13875 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13876 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13877 ; GFX7-NEXT: s_setpc_b64 s[30:31]
13879 ; GFX8-LABEL: v_fmul_v4bf16:
13881 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13882 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13883 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13884 ; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
13885 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
13886 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
13887 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13888 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13889 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
13890 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
13891 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
13892 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13893 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
13894 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
13895 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13896 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
13897 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
13898 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
13899 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13900 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13901 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13902 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13903 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
13904 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
13905 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
13906 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13907 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13908 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
13909 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
13910 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
13911 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13912 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
13913 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13914 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
13915 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
13916 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
13917 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13918 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13919 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
13920 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
13921 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
13922 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
13923 ; GFX8-NEXT: s_setpc_b64 s[30:31]
13925 ; GFX9-LABEL: v_fmul_v4bf16:
13927 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13928 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13929 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13930 ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
13931 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13932 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13933 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
13934 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
13935 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
13936 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
13937 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
13938 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
13939 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
13940 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
13941 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
13942 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
13943 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
13944 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13945 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
13946 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
13947 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
13948 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13949 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13950 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
13951 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
13952 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
13953 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
13954 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
13955 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
13956 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
13957 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
13958 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
13959 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
13960 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
13961 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
13962 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
13963 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
13964 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13966 ; GFX10-LABEL: v_fmul_v4bf16:
13968 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13969 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
13970 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
13971 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
13972 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
13973 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
13974 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
13975 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
13976 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
13977 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13978 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
13979 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
13980 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
13981 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
13982 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
13983 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
13984 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
13985 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
13986 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
13987 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
13988 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
13989 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
13990 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
13991 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
13992 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
13993 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
13994 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
13995 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13996 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
13997 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
13998 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
13999 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14000 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
14001 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14002 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14003 ; GFX10-NEXT: s_setpc_b64 s[30:31]
14005 ; GFX11-LABEL: v_fmul_v4bf16:
14007 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14008 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
14009 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
14010 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14011 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14012 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
14013 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
14014 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14015 ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
14016 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14017 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
14018 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14019 ; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
14020 ; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
14021 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
14022 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
14023 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
14024 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
14025 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
14026 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
14027 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
14028 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
14029 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
14030 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
14031 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
14032 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
14033 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
14034 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14035 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
14036 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
14037 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
14038 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14039 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14040 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
14041 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14042 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
14043 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
14044 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
14045 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14046 ; GFX11-NEXT: s_setpc_b64 s[30:31]
14047 %op = fmul <4 x bfloat> %a, %b
14048 ret <4 x bfloat> %op
; Element-wise fmul of two <8 x bfloat> vectors, returning <8 x bfloat>.
; The assertions in this function are autogenerated (see the NOTE on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the recorded output shows, per check prefix (prefixes map to the RUN
; lines at the top of the file):
;  - GCN (no -mcpu) and GFX7 (hawaii) use one VGPR per element. Every input
;    is multiplied by 1.0 and masked with 0xffff0000, v0-v7 are multiplied by
;    v8-v15 with v_mul_f32, and each product is masked with 0xffff0000 again.
;  - GFX8 (tonga) and newer hold two elements per VGPR (v0-v3 times v4-v7).
;    The low element is widened with a 16-bit left shift and the high element
;    with a 0xffff0000 mask before v_mul_f32. Each f32 product then gets bit 16
;    of itself plus 0x7fff added, or is replaced by (product | 0x400000) when
;    v_cmp_u_f32 reports it unordered with itself.
;  - The narrowed halves are repacked with v_alignbit_b32 on GFX8 and with
;    v_perm_b32 (selector 0x7060302) on GFX9, GFX10 and GFX11.
;  - The GFX11 prefix is shared by both gfx1100 RUN lines (+real-true16 and
;    -real-true16).
14051 define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
14052 ; GCN-LABEL: v_fmul_v8bf16:
14054 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14055 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
14056 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
14057 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
14058 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
14059 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
14060 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
14061 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
14062 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
14063 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
14064 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
14065 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
14066 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
14067 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
14068 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
14069 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
14070 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
14071 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14072 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14073 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14074 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14075 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14076 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14077 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14078 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14079 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14080 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14081 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14082 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14083 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14084 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14085 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14086 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14087 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
14088 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
14089 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
14090 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
14091 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
14092 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
14093 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
14094 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
14095 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14096 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14097 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14098 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14099 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14100 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14101 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14102 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14103 ; GCN-NEXT: s_setpc_b64 s[30:31]
14105 ; GFX7-LABEL: v_fmul_v8bf16:
14107 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14108 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
14109 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
14110 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
14111 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
14112 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
14113 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
14114 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
14115 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
14116 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
14117 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
14118 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
14119 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
14120 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
14121 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
14122 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
14123 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
14124 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14125 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14126 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14127 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14128 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14129 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14130 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14131 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14132 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14133 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14134 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14135 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14136 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14137 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14138 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14139 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14140 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
14141 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
14142 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
14143 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
14144 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
14145 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
14146 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
14147 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
14148 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14149 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14150 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14151 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14152 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14153 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14154 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14155 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14156 ; GFX7-NEXT: s_setpc_b64 s[30:31]
14158 ; GFX8-LABEL: v_fmul_v8bf16:
14160 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14161 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14162 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14163 ; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
14164 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
14165 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
14166 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14167 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14168 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
14169 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
14170 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
14171 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
14172 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
14173 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
14174 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
14175 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
14176 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
14177 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
14178 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14179 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
14180 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
14181 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
14182 ; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
14183 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
14184 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
14185 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14186 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14187 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14188 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
14189 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
14190 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14191 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
14192 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
14193 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
14194 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
14195 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
14196 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14197 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
14198 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14199 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
14200 ; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
14201 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
14202 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
14203 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14204 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14205 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14206 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
14207 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
14208 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14209 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
14210 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
14211 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
14212 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
14213 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
14214 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14215 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
14216 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
14217 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
14218 ; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
14219 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
14220 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
14221 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14222 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14223 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14224 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
14225 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
14226 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14227 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
14228 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
14229 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
14230 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
14231 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
14232 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14233 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
14234 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
14235 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
14236 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
14237 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
14238 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
14239 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
14240 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
14241 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
14242 ; GFX8-NEXT: s_setpc_b64 s[30:31]
14244 ; GFX9-LABEL: v_fmul_v8bf16:
14246 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14247 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14248 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14249 ; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
14250 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14251 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14252 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
14253 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
14254 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
14255 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
14256 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
14257 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
14258 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
14259 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
14260 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
14261 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
14262 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14263 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
14264 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
14265 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
14266 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
14267 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14268 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14269 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
14270 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
14271 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
14272 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
14273 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14274 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
14275 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
14276 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
14277 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
14278 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14279 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
14280 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14281 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
14282 ; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
14283 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14284 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14285 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
14286 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
14287 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
14288 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
14289 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14290 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
14291 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
14292 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
14293 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
14294 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14295 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
14296 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
14297 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
14298 ; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
14299 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14300 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14301 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
14302 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
14303 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
14304 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
14305 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14306 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
14307 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
14308 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
14309 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
14310 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14311 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
14312 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
14313 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
14314 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
14315 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
14316 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
14317 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14319 ; GFX10-LABEL: v_fmul_v8bf16:
14321 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14322 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14323 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14324 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14325 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14326 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
14327 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14328 ; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
14329 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
14330 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14331 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
14332 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
14333 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
14334 ; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9
14335 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
14336 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
14337 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
14338 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
14339 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
14340 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
14341 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14342 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
14343 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14344 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
14345 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
14346 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
14347 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
14348 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
14349 ; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6
14350 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
14351 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14352 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14353 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
14354 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
14355 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
14356 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
14357 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14358 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14359 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
14360 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
14361 ; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13
14362 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
14363 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
14364 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14365 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
14366 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
14367 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
14368 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
14369 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
14370 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
14371 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
14372 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
14373 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
14374 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
14375 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
14376 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
14377 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
14378 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
14379 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
14380 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14381 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14382 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14383 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14384 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
14385 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14386 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14387 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14388 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14389 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
14390 ; GFX10-NEXT: s_setpc_b64 s[30:31]
14392 ; GFX11-LABEL: v_fmul_v8bf16:
14394 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14395 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
14396 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
14397 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14398 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
14399 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
14400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
14401 ; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
14402 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
14403 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
14404 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14405 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
14406 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14407 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
14408 ; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9
14409 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
14410 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
14411 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14412 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
14413 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
14414 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
14415 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
14416 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
14417 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
14418 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
14419 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
14420 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
14421 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14422 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
14423 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
14425 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
14426 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
14427 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14428 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
14429 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14430 ; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6
14431 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
14432 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
14433 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
14434 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
14435 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
14436 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
14437 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
14438 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
14439 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
14440 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14441 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14442 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
14443 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
14444 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
14445 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
14446 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
14447 ; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13
14448 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14449 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
14450 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
14451 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
14452 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
14453 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
14454 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
14455 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
14456 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
14457 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
14458 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
14459 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
14460 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
14461 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
14462 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
14463 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
14464 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
14465 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
14466 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
14467 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
14468 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
14469 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
14470 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
14471 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
14472 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
14473 ; GFX11-NEXT: s_setpc_b64 s[30:31]
14474 %op = fmul <8 x bfloat> %a, %b
14475 ret <8 x bfloat> %op
14478 define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
14479 ; GCN-LABEL: v_fmul_v16bf16:
14481 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14482 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
14483 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
14484 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
14485 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14486 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
14487 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
14488 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
14489 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
14490 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14491 ; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
14492 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
14493 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
14494 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
14495 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14496 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
14497 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
14498 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
14499 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
14500 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14501 ; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
14502 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
14503 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
14504 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
14505 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14506 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
14507 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
14508 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
14509 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
14510 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14511 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
14512 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
14513 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
14514 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
14515 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14516 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
14517 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
14518 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
14519 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
14520 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14521 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
14522 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
14523 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
14524 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14525 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14526 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
14527 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
14528 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
14529 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
14530 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14531 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
14532 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
14533 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
14534 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
14535 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
14536 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
14537 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
14538 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
14539 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
14540 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
14541 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
14542 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
14543 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
14544 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14545 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
14546 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
14547 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14548 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
14549 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14550 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
14551 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14552 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
14553 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14554 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14555 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14556 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
14557 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
14558 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
14559 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
14560 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14561 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14562 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14563 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14564 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14565 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14566 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14567 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14568 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14569 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14570 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14571 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14572 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14573 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14574 ; GCN-NEXT: s_waitcnt vmcnt(0)
14575 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
14576 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14577 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
14578 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14579 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14580 ; GCN-NEXT: s_setpc_b64 s[30:31]
14582 ; GFX7-LABEL: v_fmul_v16bf16:
14584 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14585 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
14586 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
14587 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14588 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14589 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
14590 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
14591 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
14592 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
14593 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
14594 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
14595 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
14596 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
14597 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
14598 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
14599 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
14600 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
14601 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
14602 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
14603 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
14604 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
14605 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
14606 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
14607 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
14608 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
14609 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
14610 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
14611 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
14612 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
14613 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
14614 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
14615 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
14616 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
14617 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
14618 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
14619 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
14620 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
14621 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14622 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
14623 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14624 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
14625 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14626 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
14627 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14628 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
14629 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14630 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
14631 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14632 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
14633 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14634 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
14635 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14636 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14637 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
14638 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14639 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
14640 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14641 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
14642 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14643 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
14644 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14645 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
14646 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14647 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
14648 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14649 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
14650 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
14651 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
14652 ; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
14653 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
14654 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
14655 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
14656 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
14657 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
14658 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
14659 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
14660 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
14661 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
14662 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
14663 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14664 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14665 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14666 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14667 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14668 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14669 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14670 ; GFX7-NEXT: s_waitcnt vmcnt(0)
14671 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
14672 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
14673 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
14674 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14675 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14676 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14677 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14678 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14679 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14680 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14681 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14682 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14683 ; GFX7-NEXT: s_setpc_b64 s[30:31]
14685 ; GFX8-LABEL: v_fmul_v16bf16:
14687 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14688 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14689 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14690 ; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
14691 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
14692 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
14693 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
14694 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14695 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14696 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14697 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
14698 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
14699 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
14700 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
14701 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
14702 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
14703 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
14704 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
14705 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14706 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
14707 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
14708 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
14709 ; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
14710 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
14711 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
14712 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14713 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14714 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14715 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
14716 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
14717 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
14718 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
14719 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
14720 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
14721 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
14722 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
14723 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14724 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
14725 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
14726 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
14727 ; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
14728 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
14729 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
14730 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14731 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14732 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14733 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
14734 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
14735 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
14736 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
14737 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
14738 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
14739 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
14740 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
14741 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14742 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
14743 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
14744 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
14745 ; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
14746 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
14747 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
14748 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14749 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14750 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14751 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
14752 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
14753 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
14754 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
14755 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
14756 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
14757 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
14758 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
14759 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
14760 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
14761 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
14762 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
14763 ; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
14764 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
14765 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
14766 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14767 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14768 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14769 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
14770 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
14771 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
14772 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
14773 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
14774 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
14775 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
14776 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
14777 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14778 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
14779 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
14780 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
14781 ; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
14782 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
14783 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
14784 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14785 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14786 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14787 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
14788 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
14789 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
14790 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
14791 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
14792 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
14793 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
14794 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
14795 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14796 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
14797 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
14798 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
14799 ; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
14800 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
14801 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
14802 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14803 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14804 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14805 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
14806 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
14807 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
14808 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
14809 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
14810 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
14811 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
14812 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
14813 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14814 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
14815 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
14816 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
14817 ; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
14818 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
14819 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
14820 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14821 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14822 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
14823 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
14824 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
14825 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
14826 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
14827 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
14828 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
14829 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
14830 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
14831 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14832 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
14833 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
14834 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
14835 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
14836 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
14837 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
14838 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
14839 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
14840 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
14841 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
14842 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
14843 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
14844 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
14845 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
14846 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
14847 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
14848 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
14849 ; GFX8-NEXT: s_setpc_b64 s[30:31]
14851 ; GFX9-LABEL: v_fmul_v16bf16:
14853 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14854 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14855 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14856 ; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
14857 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
14858 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
14859 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
14860 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
14861 ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
14862 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
14863 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
14864 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
14865 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
14866 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
14867 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
14868 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
14869 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
14870 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
14871 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
14872 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
14873 ; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
14874 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
14875 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
14876 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
14877 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
14878 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
14879 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
14880 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
14881 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
14882 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
14883 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
14884 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
14885 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
14886 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
14887 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
14888 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
14889 ; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
14890 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
14891 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
14892 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
14893 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
14894 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
14895 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
14896 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
14897 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
14898 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
14899 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
14900 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
14901 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
14902 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
14903 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
14904 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
14905 ; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
14906 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
14907 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
14908 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
14909 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
14910 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
14911 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
14912 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
14913 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
14914 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
14915 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
14916 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
14917 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
14918 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
14919 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
14920 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
14921 ; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
14922 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
14923 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
14924 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
14925 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
14926 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
14927 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
14928 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
14929 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
14930 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
14931 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
14932 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
14933 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14934 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
14935 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
14936 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
14937 ; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
14938 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
14939 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
14940 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
14941 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
14942 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
14943 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
14944 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
14945 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
14946 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
14947 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
14948 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
14949 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
14950 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
14951 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
14952 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
14953 ; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
14954 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
14955 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
14956 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
14957 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
14958 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
14959 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
14960 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
14961 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
14962 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
14963 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
14964 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
14965 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
14966 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
14967 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
14968 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
14969 ; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
14970 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
14971 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
14972 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
14973 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
14974 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
14975 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
14976 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
14977 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
14978 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
14979 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
14980 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
14981 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
14982 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
14983 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
14984 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
14985 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
14986 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
14987 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
14988 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
14989 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
14990 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
14991 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
14992 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14994 ; GFX10-LABEL: v_fmul_v16bf16:
14996 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14997 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
14998 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
14999 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15000 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15001 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
15002 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15003 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
15004 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
15005 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
15006 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15007 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
15008 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
15009 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
15010 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
15011 ; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
15012 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
15013 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
15014 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
15015 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
15016 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
15017 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
15018 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
15019 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
15020 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
15021 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
15022 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15023 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15024 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
15025 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
15026 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15027 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15028 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
15029 ; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19
15030 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
15031 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
15032 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
15033 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
15034 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
15035 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
15036 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
15037 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
15038 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
15039 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15040 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15041 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
15042 ; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18
15043 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
15044 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
15045 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15046 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
15047 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
15048 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
15049 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
15050 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15051 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
15052 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
15053 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
15054 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
15055 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15056 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15057 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
15058 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15059 ; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
15060 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
15061 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
15062 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
15063 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
15064 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
15065 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15066 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
15067 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
15068 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
15069 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
15070 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15071 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
15072 ; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18
15073 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15074 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
15075 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
15076 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
15077 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
15078 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
15079 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15080 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
15081 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
15082 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
15083 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
15084 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15085 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15086 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
15087 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
15088 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
15089 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15090 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
15091 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
15092 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
15093 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15094 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
15095 ; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20
15096 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
15097 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
15098 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15099 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15100 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
15101 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
15102 ; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20
15103 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
15104 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
15105 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
15106 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
15107 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
15108 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
15109 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
15110 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
15111 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
15112 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15113 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
15114 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
15115 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
15116 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
15117 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
15118 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
15119 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15120 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
15121 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
15122 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
15123 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15124 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
15125 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15126 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
15127 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
15128 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15129 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
15130 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
15131 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15132 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
15133 ; GFX10-NEXT: s_setpc_b64 s[30:31]
15135 ; GFX11-LABEL: v_fmul_v16bf16:
15137 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15138 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
15139 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
15140 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15141 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
15142 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
15143 ; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
15144 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
15145 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15146 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
15147 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
15148 ; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17
15149 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14
15150 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
15151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15152 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
15153 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
15154 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15155 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
15156 ; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
15157 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
15158 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
15159 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
15160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15161 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
15162 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
15163 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
15164 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
15165 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
15166 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15167 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
15168 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
15169 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
15170 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15171 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
15172 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
15173 ; GFX11-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
15174 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
15175 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
15176 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
15177 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15178 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15179 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15180 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
15181 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
15182 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
15183 ; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12
15184 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
15185 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15186 ; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13
15187 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
15188 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
15189 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
15190 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
15191 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
15192 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
15193 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
15194 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
15195 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
15196 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
15197 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
15198 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
15199 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
15200 ; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12
15201 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15202 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
15203 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
15204 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
15205 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
15206 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15207 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
15208 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
15209 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
15210 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
15211 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
15212 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
15213 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
15214 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
15215 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
15216 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
15217 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
15218 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15219 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15220 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
15221 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
15222 ; GFX11-NEXT: v_mul_f32_e32 v18, v19, v18
15223 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
15224 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
15225 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15226 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15227 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
15228 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
15229 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15230 ; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
15231 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11
15232 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
15233 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
15234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
15235 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
15236 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
15237 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
15238 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
15239 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
15240 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
15241 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
15242 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
15243 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
15244 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
15245 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
15246 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
15247 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
15248 ; GFX11-NEXT: v_mul_f32_e32 v19, v22, v20
15249 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
15250 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
15251 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15252 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
15253 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
15254 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15255 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15256 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
15257 ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9
15258 ; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20
15259 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15260 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
15261 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
15262 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
15263 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
15264 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
15265 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
15266 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
15267 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
15268 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
15269 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
15270 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
15271 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
15272 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
15273 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
15274 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
15275 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
15276 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
15277 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
15278 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
15279 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
15280 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
15281 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
15282 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15283 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
15284 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
15285 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
15286 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
15287 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
15288 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
15289 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
15290 ; GFX11-NEXT: s_setpc_b64 s[30:31]
15291 %op = fmul <16 x bfloat> %a, %b
15292 ret <16 x bfloat> %op
15295 define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
15296 ; GCN-LABEL: v_fmul_v32bf16:
15298 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15299 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
15300 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
15301 ; GCN-NEXT: s_waitcnt vmcnt(1)
15302 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
15303 ; GCN-NEXT: s_waitcnt vmcnt(0)
15304 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
15305 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15306 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15307 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
15308 ; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
15309 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
15310 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15311 ; GCN-NEXT: s_waitcnt vmcnt(0)
15312 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15313 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15314 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
15315 ; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
15316 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
15317 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15318 ; GCN-NEXT: s_waitcnt vmcnt(0)
15319 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15320 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15321 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
15322 ; GCN-NEXT: v_mul_f32_e32 v29, v29, v32
15323 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
15324 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15325 ; GCN-NEXT: s_waitcnt vmcnt(0)
15326 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15327 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15328 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
15329 ; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
15330 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
15331 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15332 ; GCN-NEXT: s_waitcnt vmcnt(0)
15333 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15334 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15335 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
15336 ; GCN-NEXT: v_mul_f32_e32 v27, v27, v32
15337 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
15338 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15339 ; GCN-NEXT: s_waitcnt vmcnt(0)
15340 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15341 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15342 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
15343 ; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
15344 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
15345 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15346 ; GCN-NEXT: s_waitcnt vmcnt(0)
15347 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15348 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15349 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
15350 ; GCN-NEXT: v_mul_f32_e32 v25, v25, v32
15351 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
15352 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15353 ; GCN-NEXT: s_waitcnt vmcnt(0)
15354 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15355 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15356 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
15357 ; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
15358 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
15359 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15360 ; GCN-NEXT: s_waitcnt vmcnt(0)
15361 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15362 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15363 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
15364 ; GCN-NEXT: v_mul_f32_e32 v23, v23, v32
15365 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
15366 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15367 ; GCN-NEXT: s_waitcnt vmcnt(0)
15368 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15369 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15370 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
15371 ; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
15372 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
15373 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15374 ; GCN-NEXT: s_waitcnt vmcnt(0)
15375 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15376 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15377 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
15378 ; GCN-NEXT: v_mul_f32_e32 v21, v21, v32
15379 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
15380 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15381 ; GCN-NEXT: s_waitcnt vmcnt(0)
15382 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15383 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15384 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
15385 ; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
15386 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
15387 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15388 ; GCN-NEXT: s_waitcnt vmcnt(0)
15389 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15390 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15391 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
15392 ; GCN-NEXT: v_mul_f32_e32 v19, v19, v32
15393 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
15394 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15395 ; GCN-NEXT: s_waitcnt vmcnt(0)
15396 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15397 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15398 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
15399 ; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
15400 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
15401 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15402 ; GCN-NEXT: s_waitcnt vmcnt(0)
15403 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15404 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15405 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
15406 ; GCN-NEXT: v_mul_f32_e32 v17, v17, v32
15407 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
15408 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15409 ; GCN-NEXT: s_waitcnt vmcnt(0)
15410 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15411 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15412 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
15413 ; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
15414 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
15415 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15416 ; GCN-NEXT: s_waitcnt vmcnt(0)
15417 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15418 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15419 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
15420 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v32
15421 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
15422 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15423 ; GCN-NEXT: s_waitcnt vmcnt(0)
15424 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15425 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15426 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
15427 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
15428 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
15429 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15430 ; GCN-NEXT: s_waitcnt vmcnt(0)
15431 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15432 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15433 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
15434 ; GCN-NEXT: v_mul_f32_e32 v13, v13, v32
15435 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
15436 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15437 ; GCN-NEXT: s_waitcnt vmcnt(0)
15438 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15439 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15440 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
15441 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
15442 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
15443 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15444 ; GCN-NEXT: s_waitcnt vmcnt(0)
15445 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15446 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15447 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
15448 ; GCN-NEXT: v_mul_f32_e32 v11, v11, v32
15449 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
15450 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15451 ; GCN-NEXT: s_waitcnt vmcnt(0)
15452 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15453 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15454 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
15455 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
15456 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
15457 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15458 ; GCN-NEXT: s_waitcnt vmcnt(0)
15459 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15460 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15461 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
15462 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v32
15463 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
15464 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15465 ; GCN-NEXT: s_waitcnt vmcnt(0)
15466 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15467 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15468 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
15469 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
15470 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
15471 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15472 ; GCN-NEXT: s_waitcnt vmcnt(0)
15473 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15474 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15475 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
15476 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v32
15477 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
15478 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15479 ; GCN-NEXT: s_waitcnt vmcnt(0)
15480 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15481 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15482 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
15483 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
15484 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
15485 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15486 ; GCN-NEXT: s_waitcnt vmcnt(0)
15487 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15488 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15489 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
15490 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v32
15491 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
15492 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15493 ; GCN-NEXT: s_waitcnt vmcnt(0)
15494 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15495 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15496 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
15497 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
15498 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
15499 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15500 ; GCN-NEXT: s_waitcnt vmcnt(0)
15501 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15502 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15503 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
15504 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v32
15505 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
15506 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15507 ; GCN-NEXT: s_waitcnt vmcnt(0)
15508 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15509 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15510 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
15511 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
15512 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
15513 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15514 ; GCN-NEXT: s_waitcnt vmcnt(0)
15515 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15516 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15517 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
15518 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v32
15519 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
15520 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15521 ; GCN-NEXT: s_waitcnt vmcnt(0)
15522 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
15523 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15524 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
15525 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15526 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15527 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15528 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15529 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15530 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15531 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15532 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15533 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15534 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15535 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15536 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15537 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15538 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15539 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15540 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15541 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15542 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15543 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15544 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15545 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15546 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15547 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15548 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15549 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15550 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15551 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15552 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15553 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15554 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15555 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15556 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15557 ; GCN-NEXT: s_setpc_b64 s[30:31]
15559 ; GFX7-LABEL: v_fmul_v32bf16:
15561 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15562 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
15563 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
15564 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
15565 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15566 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
15567 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15568 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
15569 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15570 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
15571 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15572 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
15573 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15574 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
15575 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15576 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
15577 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15578 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
15579 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15580 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
15581 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15582 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
15583 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15584 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
15585 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15586 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
15587 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15588 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
15589 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15590 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
15591 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15592 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
15593 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15594 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
15595 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15596 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
15597 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15598 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
15599 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15600 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
15601 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15602 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
15603 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15604 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
15605 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15606 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
15607 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15608 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
15609 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15610 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
15611 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15612 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
15613 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15614 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
15615 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15616 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
15617 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15618 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
15619 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15620 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
15621 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15622 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
15623 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15624 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
15625 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15626 ; GFX7-NEXT: s_waitcnt vmcnt(1)
15627 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
15628 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15629 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15630 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15631 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15632 ; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
15633 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
15634 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
15635 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15636 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15637 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15638 ; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
15639 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
15640 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15641 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15642 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15643 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15644 ; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
15645 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
15646 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15647 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15648 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15649 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15650 ; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
15651 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
15652 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15653 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15654 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15655 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15656 ; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
15657 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
15658 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15659 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15660 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15661 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15662 ; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
15663 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
15664 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15665 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15666 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15667 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15668 ; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
15669 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
15670 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15671 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15672 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15673 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15674 ; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
15675 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
15676 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15677 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15678 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15679 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15680 ; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
15681 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
15682 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15683 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15684 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15685 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15686 ; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
15687 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
15688 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15689 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15690 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15691 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15692 ; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
15693 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
15694 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
15695 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15696 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15697 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15698 ; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
15699 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
15700 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
15701 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15702 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15703 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15704 ; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
15705 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
15706 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
15707 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15708 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15709 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15710 ; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
15711 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
15712 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
15713 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15714 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15715 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15716 ; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
15717 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
15718 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
15719 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15720 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15721 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15722 ; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
15723 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
15724 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
15725 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15726 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15727 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15728 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
15729 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
15730 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15731 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15732 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15733 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15734 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
15735 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
15736 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15737 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15738 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15739 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15740 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
15741 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
15742 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15743 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15744 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15745 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15746 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
15747 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
15748 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15749 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15750 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15751 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15752 ; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
15753 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
15754 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15755 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15756 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15757 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15758 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
15759 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
15760 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15761 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15762 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15763 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15764 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
15765 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
15766 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15767 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15768 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15769 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15770 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
15771 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
15772 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15773 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15774 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15775 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15776 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
15777 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
15778 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15779 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15780 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15781 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15782 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
15783 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
15784 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15785 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15786 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15787 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15788 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
15789 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
15790 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
15791 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15792 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15793 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15794 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
15795 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
15796 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
15797 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15798 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15799 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15800 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
15801 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
15802 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
15803 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15804 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15805 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15806 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
15807 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
15808 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
15809 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15810 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15811 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15812 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
15813 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
15814 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
15815 ; GFX7-NEXT: s_waitcnt vmcnt(0)
15816 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
15817 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
15818 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
15819 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
15820 ; GFX7-NEXT: s_setpc_b64 s[30:31]
15822 ; GFX8-LABEL: v_fmul_v32bf16:
15824 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15825 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
15826 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
15827 ; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
15828 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
15829 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
15830 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
15831 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15832 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
15833 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
15834 ; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
15835 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
15836 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
15837 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
15838 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
15839 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
15840 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
15841 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
15842 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
15843 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
15844 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
15845 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
15846 ; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30
15847 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
15848 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
15849 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
15850 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
15851 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
15852 ; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
15853 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
15854 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
15855 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
15856 ; GFX8-NEXT: s_waitcnt vmcnt(0)
15857 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
15858 ; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34
15859 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
15860 ; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30
15861 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
15862 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
15863 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
15864 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
15865 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
15866 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
15867 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
15868 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
15869 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15870 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
15871 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
15872 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
15873 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
15874 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
15875 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15876 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
15877 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
15878 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
15879 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
15880 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
15881 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
15882 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
15883 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
15884 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
15885 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
15886 ; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29
15887 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
15888 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
15889 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
15890 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
15891 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15892 ; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
15893 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
15894 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
15895 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
15896 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
15897 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
15898 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
15899 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
15900 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
15901 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
15902 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
15903 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
15904 ; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28
15905 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
15906 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
15907 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
15908 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
15909 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15910 ; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
15911 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
15912 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
15913 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
15914 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
15915 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
15916 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
15917 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
15918 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
15919 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
15920 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
15921 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
15922 ; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27
15923 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
15924 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
15925 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
15926 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
15927 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15928 ; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
15929 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
15930 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
15931 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
15932 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
15933 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
15934 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
15935 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
15936 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
15937 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
15938 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
15939 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
15940 ; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26
15941 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
15942 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
15943 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
15944 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
15945 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15946 ; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
15947 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
15948 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
15949 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
15950 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
15951 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
15952 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
15953 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
15954 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
15955 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
15956 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
15957 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
15958 ; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25
15959 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
15960 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
15961 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
15962 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
15963 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15964 ; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
15965 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
15966 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
15967 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
15968 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
15969 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
15970 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
15971 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
15972 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
15973 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
15974 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
15975 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
15976 ; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
15977 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
15978 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
15979 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
15980 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
15981 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
15982 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
15983 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
15984 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
15985 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
15986 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
15987 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
15988 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
15989 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
15990 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
15991 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
15992 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
15993 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
15994 ; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
15995 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
15996 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
15997 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
15998 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
15999 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16000 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
16001 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
16002 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
16003 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
16004 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
16005 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
16006 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
16007 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
16008 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
16009 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
16010 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
16011 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
16012 ; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
16013 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
16014 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
16015 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16016 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16017 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16018 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
16019 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
16020 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
16021 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
16022 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
16023 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
16024 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
16025 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
16026 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
16027 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
16028 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
16029 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
16030 ; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
16031 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
16032 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
16033 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16034 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16035 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16036 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
16037 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
16038 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
16039 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
16040 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
16041 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
16042 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
16043 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
16044 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
16045 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
16046 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
16047 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
16048 ; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
16049 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
16050 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
16051 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16052 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16053 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16054 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
16055 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
16056 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
16057 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
16058 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
16059 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
16060 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
16061 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
16062 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
16063 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
16064 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
16065 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
16066 ; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
16067 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
16068 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
16069 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16070 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16071 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16072 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
16073 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
16074 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
16075 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
16076 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
16077 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
16078 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
16079 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
16080 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
16081 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
16082 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
16083 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
16084 ; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
16085 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
16086 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
16087 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16088 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16089 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16090 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
16091 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
16092 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
16093 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
16094 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
16095 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
16096 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
16097 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
16098 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16099 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
16100 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
16101 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
16102 ; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
16103 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
16104 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
16105 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16106 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16107 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
16108 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
16109 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
16110 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
16111 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
16112 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
16113 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
16114 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
16115 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
16116 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
16117 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
16118 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
16119 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
16120 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
16121 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
16122 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
16123 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
16124 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
16125 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
16126 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
16127 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
16128 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
16129 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
16130 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
16131 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
16132 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
16133 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
16134 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
16135 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
16136 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
16137 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
16138 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
16139 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
16140 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
16141 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
16142 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
16143 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
16144 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
16145 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
16146 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
16147 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
16148 ; GFX8-NEXT: s_setpc_b64 s[30:31]
16150 ; GFX9-LABEL: v_fmul_v32bf16:
16152 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16153 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
16154 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
16155 ; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
16156 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
16157 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16158 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
16159 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
16160 ; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
16161 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
16162 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
16163 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
16164 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
16165 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
16166 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
16167 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
16168 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
16169 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
16170 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
16171 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
16172 ; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
16173 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16174 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16175 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
16176 ; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
16177 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
16178 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
16179 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
16180 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
16181 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
16182 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
16183 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
16184 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
16185 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
16186 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
16187 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
16188 ; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
16189 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
16190 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
16191 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
16192 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
16193 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
16194 ; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
16195 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
16196 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
16197 ; GFX9-NEXT: s_waitcnt vmcnt(0)
16198 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
16199 ; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
16200 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16201 ; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
16202 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
16203 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
16204 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
16205 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
16206 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
16207 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
16208 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
16209 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
16210 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
16211 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
16212 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
16213 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
16214 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
16215 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
16216 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
16217 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
16218 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
16219 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
16220 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
16221 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
16222 ; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
16223 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16224 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16225 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
16226 ; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
16227 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
16228 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
16229 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
16230 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
16231 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
16232 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
16233 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
16234 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
16235 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
16236 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
16237 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
16238 ; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
16239 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
16240 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
16241 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
16242 ; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
16243 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
16244 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
16245 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
16246 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
16247 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
16248 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
16249 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
16250 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
16251 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
16252 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
16253 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
16254 ; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
16255 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16256 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16257 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
16258 ; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
16259 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
16260 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
16261 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
16262 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
16263 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
16264 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
16265 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
16266 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
16267 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
16268 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
16269 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
16270 ; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
16271 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
16272 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16273 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
16274 ; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
16275 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
16276 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
16277 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
16278 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
16279 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
16280 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
16281 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
16282 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
16283 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
16284 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
16285 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
16286 ; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
16287 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16288 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16289 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
16290 ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
16291 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
16292 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
16293 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
16294 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
16295 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
16296 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
16297 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
16298 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
16299 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
16300 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
16301 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
16302 ; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
16303 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16304 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16305 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
16306 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
16307 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
16308 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
16309 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
16310 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
16311 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
16312 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
16313 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
16314 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
16315 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
16316 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
16317 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
16318 ; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
16319 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16320 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16321 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
16322 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
16323 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
16324 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
16325 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
16326 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
16327 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
16328 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
16329 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
16330 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
16331 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
16332 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
16333 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
16334 ; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
16335 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16336 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16337 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
16338 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
16339 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
16340 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
16341 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
16342 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
16343 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
16344 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
16345 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
16346 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
16347 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
16348 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
16349 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
16350 ; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
16351 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16352 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16353 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
16354 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
16355 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
16356 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
16357 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
16358 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
16359 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
16360 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
16361 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
16362 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
16363 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
16364 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
16365 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
16366 ; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
16367 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16368 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16369 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
16370 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
16371 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
16372 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
16373 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
16374 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
16375 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
16376 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
16377 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
16378 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
16379 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
16380 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
16381 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
16382 ; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
16383 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16384 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16385 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
16386 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
16387 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
16388 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
16389 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
16390 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
16391 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
16392 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
16393 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
16394 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16395 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
16396 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
16397 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
16398 ; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
16399 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16400 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16401 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
16402 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
16403 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
16404 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
16405 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
16406 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
16407 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
16408 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
16409 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
16410 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
16411 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
16412 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
16413 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
16414 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
16415 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
16416 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
16417 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
16418 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
16419 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
16420 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
16421 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
16422 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
16423 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
16424 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
16425 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
16426 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
16427 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
16428 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
16429 ; GFX9-NEXT: s_setpc_b64 s[30:31]
16431 ; GFX10-LABEL: v_fmul_v32bf16:
16433 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16434 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
16435 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
16436 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
16437 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16438 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16439 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
16440 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
16441 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
16442 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
16443 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
16444 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
16445 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
16446 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
16447 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
16448 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
16449 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16450 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16451 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
16452 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
16453 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
16454 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16455 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
16456 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
16457 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16458 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16459 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
16460 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
16461 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16462 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16463 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
16464 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
16465 ; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
16466 ; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
16467 ; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
16468 ; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
16469 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
16470 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
16471 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
16472 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16473 ; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
16474 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
16475 ; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
16476 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
16477 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16478 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
16479 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
16480 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
16481 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16482 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16483 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
16484 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
16485 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16486 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16487 ; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
16488 ; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
16489 ; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
16490 ; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55
16491 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
16492 ; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
16493 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
16494 ; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
16495 ; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
16496 ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
16497 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
16498 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
16499 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
16500 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
16501 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
16502 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16503 ; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
16504 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
16505 ; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
16506 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
16507 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16508 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16509 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
16510 ; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
16511 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
16512 ; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
16513 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
16514 ; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
16515 ; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
16516 ; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
16517 ; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
16518 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
16519 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
16520 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
16521 ; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
16522 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
16523 ; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
16524 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
16525 ; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
16526 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16527 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16528 ; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
16529 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
16530 ; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
16531 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
16532 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16533 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16534 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
16535 ; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
16536 ; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
16537 ; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
16538 ; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
16539 ; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
16540 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
16541 ; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
16542 ; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
16543 ; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
16544 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
16545 ; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
16546 ; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
16547 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
16548 ; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
16549 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
16550 ; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
16551 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
16552 ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
16553 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
16554 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
16555 ; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
16556 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
16557 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
16558 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
16559 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
16560 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
16561 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
16562 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
16563 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
16564 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
16565 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
16566 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
16567 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
16568 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
16569 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
16570 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
16571 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
16572 ; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
16573 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
16574 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
16575 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
16576 ; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
16577 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
16578 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
16579 ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
16580 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
16581 ; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
16582 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
16583 ; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
16584 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
16585 ; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
16586 ; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
16587 ; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
16588 ; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
16589 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
16590 ; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
16591 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
16592 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
16593 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
16594 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
16595 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
16596 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
16597 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
16598 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
16599 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
16600 ; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
16601 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
16602 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
16603 ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
16604 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
16605 ; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
16606 ; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
16607 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
16608 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
16609 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
16610 ; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
16611 ; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
16612 ; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
16613 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
16614 ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
16615 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
16616 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
16617 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
16618 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
16619 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
16620 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
16621 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
16622 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
16623 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
16624 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
16625 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
16626 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
16627 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
16628 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
16629 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
16630 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
16631 ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
16632 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
16633 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
16634 ; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
16635 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
16636 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
16637 ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
16638 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
16639 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
16640 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
16641 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
16642 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
16643 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
16644 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
16645 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
16646 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
16647 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
16648 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
16649 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
16650 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
16651 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
16652 ; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
16653 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
16654 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
16655 ; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
16656 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
16657 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
16658 ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
16659 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
16660 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
16661 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
16662 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
16663 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
16664 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
16665 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
16666 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
16667 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
16668 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
16669 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
16670 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
16671 ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
16672 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
16673 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
16674 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
16675 ; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
16676 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
16677 ; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
16678 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
16679 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
16680 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
16681 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
16682 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
16683 ; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
16684 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
16685 ; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
16686 ; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
16687 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
16688 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
16689 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
16690 ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
16691 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
16692 ; GFX10-NEXT: s_waitcnt vmcnt(0)
16693 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
16694 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
16695 ; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
16696 ; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
16697 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
16698 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
16699 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
16700 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
16701 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16702 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
16703 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
16704 ; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
16705 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
16706 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
16707 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
16708 ; GFX10-NEXT: s_setpc_b64 s[30:31]
16710 ; GFX11-LABEL: v_fmul_v32bf16:
16712 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16713 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
16714 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
16715 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
16716 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
16717 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
16718 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
16719 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
16720 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
16721 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
16722 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
16723 ; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
16724 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
16725 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16726 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
16727 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
16728 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
16729 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
16730 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
16731 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
16732 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
16733 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
16734 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
16735 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
16736 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
16737 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
16738 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
16739 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
16740 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16741 ; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
16742 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
16743 ; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
16744 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
16745 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
16746 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
16747 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16748 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
16749 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
16750 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
16751 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
16752 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
16753 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
16754 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
16755 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
16756 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16757 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
16758 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16759 ; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
16760 ; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
16761 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
16762 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
16763 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16764 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
16765 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
16766 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
16767 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
16768 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
16769 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
16770 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
16771 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
16772 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
16773 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
16774 ; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
16775 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16
16776 ; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
16777 ; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
16778 ; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
16779 ; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
16780 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
16781 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
16782 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
16783 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
16784 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
16785 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
16786 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
16787 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
16788 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
16789 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
16790 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
16791 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
16792 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
16793 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
16794 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
16795 ; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20
16796 ; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71
16797 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
16798 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
16799 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
16800 ; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
16801 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16802 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
16803 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
16804 ; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
16805 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
16806 ; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51
16807 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16808 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
16809 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
16810 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
16811 ; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
16812 ; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
16813 ; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
16814 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
16815 ; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
16816 ; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
16817 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
16818 ; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37
16819 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
16820 ; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
16821 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
16822 ; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30
16823 ; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
16824 ; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
16825 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
16826 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
16827 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
16828 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
16829 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
16830 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
16831 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
16832 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
16833 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
16834 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
16835 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
16836 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
16837 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
16838 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
16839 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
16840 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
16841 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
16842 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
16843 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
16844 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
16845 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
16846 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
16847 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
16848 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
16849 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
16850 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
16851 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
16852 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
16853 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
16854 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
16855 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
16856 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
16857 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
16858 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
16859 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
16860 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
16861 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
16862 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
16863 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
16864 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
16865 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
16866 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
16867 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
16868 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
16869 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
16870 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
16871 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
16872 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
16873 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
16874 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
16875 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
16876 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
16877 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
16878 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
16879 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
16880 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
16881 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
16882 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
16883 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
16884 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
16885 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
16886 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
16887 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
16888 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
16889 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
16890 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
16891 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
16892 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
16893 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
16894 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
16895 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
16896 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
16897 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
16898 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
16899 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
16900 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
16901 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
16902 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
16903 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
16904 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
16905 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
16906 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
16907 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
16908 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
16909 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
16910 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
16911 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
16912 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
16913 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
16914 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
16915 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
16916 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
16917 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
16918 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
16919 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
16920 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
16921 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
16922 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
16923 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
16924 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
16925 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
16926 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
16927 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
16928 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
16929 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16930 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
16931 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
16932 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
16933 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
16934 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
16935 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
16936 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
16937 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
16938 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
16939 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
16940 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16941 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
16942 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
16943 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
16944 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
16945 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
16946 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
16947 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
16948 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
16949 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
16950 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
16951 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
16952 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16953 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
16954 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16955 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
16956 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
16957 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
16958 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
16959 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
16960 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
16961 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
16962 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
16963 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
16964 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
16965 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
16966 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
16967 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
16968 ; GFX11-NEXT: s_waitcnt vmcnt(0)
16969 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
16970 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16971 ; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
16972 ; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18
16973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16974 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
16975 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
16976 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
16977 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
16978 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
16979 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
16980 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
16981 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
16982 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
16983 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
16984 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
16985 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
16986 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
16987 ; GFX11-NEXT: s_setpc_b64 s[30:31]
16988 %op = fmul <32 x bfloat> %a, %b
16989 ret <32 x bfloat> %op
16992 define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
16993 ; GCN-LABEL: v_fdiv_bf16:
16995 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16996 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
16997 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
16998 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
16999 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17000 ; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17001 ; GCN-NEXT: v_rcp_f32_e32 v3, v2
17002 ; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17003 ; GCN-NEXT: v_fma_f32 v3, v4, v3, v3
17004 ; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
17005 ; GCN-NEXT: v_mul_f32_e32 v5, v4, v3
17006 ; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4
17007 ; GCN-NEXT: v_fma_f32 v5, v6, v3, v5
17008 ; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4
17009 ; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5
17010 ; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17011 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17012 ; GCN-NEXT: s_setpc_b64 s[30:31]
17014 ; GFX7-LABEL: v_fdiv_bf16:
17016 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17017 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17018 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17019 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17020 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17021 ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17022 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2
17023 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17024 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
17025 ; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
17026 ; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
17027 ; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
17028 ; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
17029 ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
17030 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
17031 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17032 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17033 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17035 ; GFX8-LABEL: v_fdiv_bf16:
17037 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17038 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17039 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17040 ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17041 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
17042 ; GFX8-NEXT: v_rcp_f32_e32 v4, v2
17043 ; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0
17044 ; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4
17045 ; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4
17046 ; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3
17047 ; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5
17048 ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
17049 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5
17050 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17051 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17052 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17053 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17054 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
17055 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17056 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17057 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17058 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17060 ; GFX9-LABEL: v_fdiv_bf16:
17062 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17063 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17064 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17065 ; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
17066 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
17067 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17068 ; GFX9-NEXT: v_rcp_f32_e32 v4, v2
17069 ; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0
17070 ; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4
17071 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4
17072 ; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3
17073 ; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5
17074 ; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3
17075 ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
17076 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17077 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17078 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
17079 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17080 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17081 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17082 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17083 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17085 ; GFX10-LABEL: v_fdiv_bf16:
17087 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17088 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17089 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17090 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
17091 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17092 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
17093 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17094 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
17095 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3
17096 ; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5
17097 ; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3
17098 ; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5
17099 ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4
17100 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17101 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
17102 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
17103 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17104 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17105 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17106 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17107 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17109 ; GFX11-LABEL: v_fdiv_bf16:
17111 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17112 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17113 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17114 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17115 ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
17116 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
17117 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
17118 ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
17119 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
17120 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3
17121 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
17122 ; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3
17123 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17124 ; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5
17125 ; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3
17126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17127 ; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5
17128 ; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4
17129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17130 ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0
17131 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
17132 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
17133 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17134 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17135 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17136 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17137 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17138 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17139 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17140 %op = fdiv bfloat %a, %b
17144 declare bfloat @llvm.fabs.bf16(bfloat)
; Test: llvm.fabs.bf16 applied to a bfloat function argument.
; The expectations below clear the sign bit with an AND: GCN/GFX7 mask the
; value to its upper 16 bits (0xffff0000) and AND with 0x7fffffff, while GFX8
; and newer AND the low 16 bits with 0x7fff (v_and_b16 on v0.l for the
; GFX11 +real-true16 run).
17146 define bfloat @v_fabs_bf16(bfloat %a) {
17147 ; GCN-LABEL: v_fabs_bf16:
17149 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17150 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17151 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17152 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17153 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17154 ; GCN-NEXT: s_setpc_b64 s[30:31]
17156 ; GFX7-LABEL: v_fabs_bf16:
17158 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17159 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17160 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17161 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17162 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17163 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17165 ; GFX8-LABEL: v_fabs_bf16:
17167 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17168 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17169 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17171 ; GFX9-LABEL: v_fabs_bf16:
17173 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17174 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17175 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17177 ; GFX10-LABEL: v_fabs_bf16:
17179 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17180 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17181 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17183 ; GFX11TRUE16-LABEL: v_fabs_bf16:
17184 ; GFX11TRUE16: ; %bb.0:
17185 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17186 ; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
17187 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17189 ; GFX11FAKE16-LABEL: v_fabs_bf16:
17190 ; GFX11FAKE16: ; %bb.0:
17191 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17192 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17193 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17194 %op = call bfloat @llvm.fabs.bf16(bfloat %a)
; Test: llvm.fabs.bf16 applied to an inreg bfloat argument of an amdgpu_ps
; function. The result is bitcast to i16, zero-extended to i32 and passed
; through llvm.amdgcn.readfirstlane.
; GFX8 and newer are expected to stay on the scalar unit (s_and_b32 with
; 0x7fff, then a 0xffff mask); GCN/GFX7 are expected to use v_mul_f32 and
; v_bfe_u32 followed by v_readfirstlane_b32.
17198 define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
17199 ; GCN-LABEL: s_fabs_bf16:
17201 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
17202 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
17203 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17204 ; GCN-NEXT: ; return to shader part epilog
17206 ; GFX7-LABEL: s_fabs_bf16:
17208 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
17209 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
17210 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17211 ; GFX7-NEXT: ; return to shader part epilog
17213 ; GFX8-LABEL: s_fabs_bf16:
17215 ; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
17216 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17217 ; GFX8-NEXT: ; return to shader part epilog
17219 ; GFX9-LABEL: s_fabs_bf16:
17221 ; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
17222 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17223 ; GFX9-NEXT: ; return to shader part epilog
17225 ; GFX10-LABEL: s_fabs_bf16:
17227 ; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
17228 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17229 ; GFX10-NEXT: ; return to shader part epilog
17231 ; GFX11-LABEL: s_fabs_bf16:
17233 ; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
17234 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17235 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17236 ; GFX11-NEXT: ; return to shader part epilog
17237 %op = call bfloat @llvm.fabs.bf16(bfloat %a)
17238 %cast = bitcast bfloat %op to i16
17239 %zext = zext i16 %cast to i32
17240 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Test: fneg applied to a bfloat function argument.
; Every target is expected to flip the sign bit with a single XOR after the
; entry s_waitcnt: 0x80000000 on GCN/GFX7, 0x8000 on GFX8 and newer
; (v_xor_b16 on v0.l for the GFX11 +real-true16 run).
17244 define bfloat @v_fneg_bf16(bfloat %a) {
17245 ; GCN-LABEL: v_fneg_bf16:
17247 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17248 ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17249 ; GCN-NEXT: s_setpc_b64 s[30:31]
17251 ; GFX7-LABEL: v_fneg_bf16:
17253 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17254 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17255 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17257 ; GFX8-LABEL: v_fneg_bf16:
17259 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17260 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17261 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17263 ; GFX9-LABEL: v_fneg_bf16:
17265 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17266 ; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17267 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17269 ; GFX10-LABEL: v_fneg_bf16:
17271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17272 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17273 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17275 ; GFX11TRUE16-LABEL: v_fneg_bf16:
17276 ; GFX11TRUE16: ; %bb.0:
17277 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17278 ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
17279 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17281 ; GFX11FAKE16-LABEL: v_fneg_bf16:
17282 ; GFX11FAKE16: ; %bb.0:
17283 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17284 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17285 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17286 %op = fneg bfloat %a
17290 declare i32 @llvm.amdgcn.readfirstlane(i32)
17292 ; FIXME: readfirstlane hack for other bugs
; Test: fneg applied to an inreg bfloat argument of an amdgpu_ps function.
; The result is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane.
; GFX8 and newer are expected to use s_xor_b32 with 0x8000 followed by a
; 0xffff mask; GCN/GFX7 are expected to use v_mul_f32 by -1.0, a 16-bit right
; shift and v_readfirstlane_b32.
17293 define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
17294 ; GCN-LABEL: s_fneg_bf16:
17296 ; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
17297 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17298 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17299 ; GCN-NEXT: ; return to shader part epilog
17301 ; GFX7-LABEL: s_fneg_bf16:
17303 ; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
17304 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17305 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17306 ; GFX7-NEXT: ; return to shader part epilog
17308 ; GFX8-LABEL: s_fneg_bf16:
17310 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
17311 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17312 ; GFX8-NEXT: ; return to shader part epilog
17314 ; GFX9-LABEL: s_fneg_bf16:
17316 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
17317 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17318 ; GFX9-NEXT: ; return to shader part epilog
17320 ; GFX10-LABEL: s_fneg_bf16:
17322 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
17323 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17324 ; GFX10-NEXT: ; return to shader part epilog
17326 ; GFX11-LABEL: s_fneg_bf16:
17328 ; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
17329 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17330 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17331 ; GFX11-NEXT: ; return to shader part epilog
17332 %op = fneg bfloat %a
17333 %cast = bitcast bfloat %op to i16
17334 %zext = zext i16 %cast to i32
17335 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Test: fneg of llvm.fabs.bf16 applied to a bfloat function argument.
; GFX8 and newer are expected to emit a single OR that sets the sign bit
; (0x8000; v_or_b16 on v0.l for the GFX11 +real-true16 run). GCN/GFX7 are
; expected to emit separate AND (0x7fffffff) and XOR (0x80000000)
; instructions, each followed by a 0xffff0000 mask.
17339 define bfloat @v_fneg_fabs_bf16(bfloat %a) {
17340 ; GCN-LABEL: v_fneg_fabs_bf16:
17342 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17343 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17344 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17345 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17346 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17347 ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17348 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17349 ; GCN-NEXT: s_setpc_b64 s[30:31]
17351 ; GFX7-LABEL: v_fneg_fabs_bf16:
17353 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17354 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17355 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17356 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
17357 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17358 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
17359 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17360 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17362 ; GFX8-LABEL: v_fneg_fabs_bf16:
17364 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17365 ; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
17366 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17368 ; GFX9-LABEL: v_fneg_fabs_bf16:
17370 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17371 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
17372 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17374 ; GFX10-LABEL: v_fneg_fabs_bf16:
17376 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17377 ; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
17378 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17380 ; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
17381 ; GFX11TRUE16: ; %bb.0:
17382 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17383 ; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
17384 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17386 ; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
17387 ; GFX11FAKE16: ; %bb.0:
17388 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17389 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
17390 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17391 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17392 %op = fneg bfloat %fabs
17396 ; FIXME: readfirstlane hack for other bugs
; Test: fneg of llvm.fabs.bf16 applied to an inreg bfloat argument of an
; amdgpu_ps function. The result is bitcast to i16, zero-extended to i32 and
; passed through llvm.amdgcn.readfirstlane.
; GFX8 and newer are expected to set bit 15 with s_bitset1_b32 and then mask
; with 0xffff. GCN/GFX7 are expected to do v_mul_f32 and v_readfirstlane_b32
; first, then clear bit 31 (s_bitset0_b32), XOR with 0x80000000 and shift
; right by 16 on the scalar unit.
17397 define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
17398 ; GCN-LABEL: s_fneg_fabs_bf16:
17400 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
17401 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
17402 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
17403 ; GCN-NEXT: s_bitset0_b32 s0, 31
17404 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
17405 ; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
17406 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
17407 ; GCN-NEXT: ; return to shader part epilog
17409 ; GFX7-LABEL: s_fneg_fabs_bf16:
17411 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
17412 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
17413 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
17414 ; GFX7-NEXT: s_bitset0_b32 s0, 31
17415 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
17416 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
17417 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
17418 ; GFX7-NEXT: ; return to shader part epilog
17420 ; GFX8-LABEL: s_fneg_fabs_bf16:
17422 ; GFX8-NEXT: s_bitset1_b32 s0, 15
17423 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
17424 ; GFX8-NEXT: ; return to shader part epilog
17426 ; GFX9-LABEL: s_fneg_fabs_bf16:
17428 ; GFX9-NEXT: s_bitset1_b32 s0, 15
17429 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
17430 ; GFX9-NEXT: ; return to shader part epilog
17432 ; GFX10-LABEL: s_fneg_fabs_bf16:
17434 ; GFX10-NEXT: s_bitset1_b32 s0, 15
17435 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
17436 ; GFX10-NEXT: ; return to shader part epilog
17438 ; GFX11-LABEL: s_fneg_fabs_bf16:
17440 ; GFX11-NEXT: s_bitset1_b32 s0, 15
17441 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17442 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
17443 ; GFX11-NEXT: ; return to shader part epilog
17444 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
17445 %op = fneg bfloat %fabs
17446 %cast = bitcast bfloat %op to i16
17447 %zext = zext i16 %cast to i32
17448 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
17452 declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
17453 declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
17454 declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
17455 declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
17456 declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
17457 declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
17458 declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; Test: llvm.minnum.bf16 applied to two bfloat function arguments.
; All targets are expected to perform the operation with v_min_f32.
; GCN/GFX7 mask both operands and the result with 0xffff0000. GFX8 and newer
; shift each operand left by 16 first, and follow the min with a sequence of
; v_bfe_u32, an add of 0x7fff, v_cmp_u_f32 / v_cndmask (selecting the value
; OR'ed with 0x400000 when the compare is true) and a 16-bit right shift.
17460 define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
17461 ; GCN-LABEL: v_minnum_bf16:
17463 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17464 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17465 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17466 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17467 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17468 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1
17469 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17470 ; GCN-NEXT: s_setpc_b64 s[30:31]
17472 ; GFX7-LABEL: v_minnum_bf16:
17474 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17475 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17476 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17477 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17478 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17479 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
17480 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17481 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17483 ; GFX8-LABEL: v_minnum_bf16:
17485 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17486 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17487 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17488 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
17489 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17490 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17491 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17492 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
17493 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17494 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17495 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17496 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17498 ; GFX9-LABEL: v_minnum_bf16:
17500 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17501 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17502 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17503 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
17504 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17505 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17506 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17507 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
17508 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17509 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
17510 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17511 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17513 ; GFX10-LABEL: v_minnum_bf16:
17515 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17516 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17517 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17518 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
17519 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
17520 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
17521 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17522 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17523 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17524 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17525 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17527 ; GFX11-LABEL: v_minnum_bf16:
17529 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17530 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17531 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17532 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17533 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
17534 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
17535 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
17536 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17537 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17538 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
17539 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
17540 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17541 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17542 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17543 %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
; Test: llvm.minnum.v2bf16 applied to two <2 x bfloat> function arguments.
; Each element is expected to get its own v_min_f32.
; GCN/GFX7 operate on four separate registers (v0-v3), masking with
; 0xffff0000 before and after the min. GFX8 and newer extract the two
; elements from v0/v1 (left shift by 16 for one, 0xffff0000 mask for the
; other), apply the bfe / add 0x7fff / cmp_u / cndmask sequence per element,
; and recombine the halves with v_alignbit_b32 (GFX8) or v_perm_b32 with
; selector 0x7060302 (GFX9 and newer).
17547 define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
17548 ; GCN-LABEL: v_minnum_v2bf16:
17550 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17551 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17552 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17553 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17554 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17555 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17556 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17557 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17558 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17559 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3
17560 ; GCN-NEXT: v_min_f32_e32 v0, v0, v2
17561 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17562 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17563 ; GCN-NEXT: s_setpc_b64 s[30:31]
17565 ; GFX7-LABEL: v_minnum_v2bf16:
17567 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17568 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17569 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17570 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17571 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17572 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17573 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17574 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17575 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17576 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
17577 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
17578 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17579 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17580 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17582 ; GFX8-LABEL: v_minnum_v2bf16:
17584 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17585 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17586 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17587 ; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
17588 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
17589 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
17590 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17591 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17592 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
17593 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
17594 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
17595 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
17596 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
17597 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
17598 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
17599 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
17600 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
17601 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17602 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
17603 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17604 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
17605 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17607 ; GFX9-LABEL: v_minnum_v2bf16:
17609 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17610 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17611 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17612 ; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
17613 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17614 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17615 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
17616 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17617 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
17618 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
17619 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
17620 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
17621 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
17622 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
17623 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
17624 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
17625 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17626 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
17627 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
17628 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
17629 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17631 ; GFX10-LABEL: v_minnum_v2bf16:
17633 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17634 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17635 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17636 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17637 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17638 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
17639 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
17640 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
17641 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
17642 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
17643 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
17644 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
17645 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
17646 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
17647 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17648 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17649 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17650 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
17651 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17653 ; GFX11-LABEL: v_minnum_v2bf16:
17655 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17656 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
17657 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17658 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
17659 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17660 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
17661 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
17662 ; GFX11-NEXT: v_min_f32_e32 v2, v3, v2
17663 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17664 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
17665 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
17666 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
17667 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
17668 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
17669 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
17670 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
17671 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
17672 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
17673 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17674 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
17675 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17676 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
17677 ; GFX11-NEXT: s_setpc_b64 s[30:31]
17678 %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
17679 ret <2 x bfloat> %op
17682 define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
17683 ; GCN-LABEL: v_minnum_v3bf16:
17685 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17686 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17687 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17688 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17689 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
17690 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17691 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
17692 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17693 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17694 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17695 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17696 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17697 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17698 ; GCN-NEXT: v_min_f32_e32 v2, v2, v5
17699 ; GCN-NEXT: v_min_f32_e32 v1, v1, v4
17700 ; GCN-NEXT: v_min_f32_e32 v0, v0, v3
17701 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17702 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17703 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17704 ; GCN-NEXT: s_setpc_b64 s[30:31]
17706 ; GFX7-LABEL: v_minnum_v3bf16:
17708 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17709 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17710 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17711 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17712 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
17713 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17714 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
17715 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17716 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17717 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17718 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17719 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17720 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17721 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
17722 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
17723 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
17724 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17725 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17726 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17727 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17729 ; GFX8-LABEL: v_minnum_v3bf16:
17731 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17732 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17733 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17734 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
17735 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
17736 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
17737 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
17738 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
17739 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17740 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17741 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17742 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
17743 ; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
17744 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
17745 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
17746 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
17747 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17748 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17749 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
17750 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
17751 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
17752 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17753 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
17754 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
17755 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
17756 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
17757 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
17758 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17759 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
17760 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17761 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17762 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
17763 ; GFX8-NEXT: s_setpc_b64 s[30:31]
17765 ; GFX9-LABEL: v_minnum_v3bf16:
17767 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17768 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17769 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17770 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
17771 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
17772 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
17773 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
17774 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
17775 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17776 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17777 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17778 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
17779 ; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
17780 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17781 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17782 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
17783 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
17784 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
17785 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
17786 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17787 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
17788 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
17789 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
17790 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
17791 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17792 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
17793 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
17794 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
17795 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
17796 ; GFX9-NEXT: s_setpc_b64 s[30:31]
17798 ; GFX10-LABEL: v_minnum_v3bf16:
17800 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17801 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17802 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17803 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17804 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17805 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17806 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17807 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
17808 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
17809 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
17810 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
17811 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
17812 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
17813 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17814 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
17815 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17816 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
17817 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17818 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
17819 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17820 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17821 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17822 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17823 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17824 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17825 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17826 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
17827 ; GFX10-NEXT: s_setpc_b64 s[30:31]
17829 ; GFX11TRUE16-LABEL: v_minnum_v3bf16:
17830 ; GFX11TRUE16: ; %bb.0:
17831 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17832 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17833 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17834 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17835 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17836 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17837 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17838 ; GFX11TRUE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17839 ; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17840 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17841 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
17842 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
17843 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17844 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
17845 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
17846 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17847 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
17848 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
17849 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17850 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17851 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17852 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17853 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17854 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17855 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17856 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17857 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17858 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17859 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
17860 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17862 ; GFX11FAKE16-LABEL: v_minnum_v3bf16:
17863 ; GFX11FAKE16: ; %bb.0:
17864 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17865 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
17866 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17867 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
17868 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17869 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17870 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17871 ; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
17872 ; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
17873 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
17874 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
17875 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
17876 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
17877 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
17878 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
17879 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
17880 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
17881 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
17882 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
17883 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17884 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
17885 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
17886 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17887 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
17888 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
17889 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
17890 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
17891 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17892 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
17893 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
17894 %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
17895 ret <3 x bfloat> %op
; v_minnum_v4bf16 - llvm.minnum applied to two <4 x bfloat> arguments, returning
; <4 x bfloat>. The assertions below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; Shape of the expected code, as recorded by the checks:
; * GCN and GFX7 runs - each of v0-v7 is multiplied by 1.0 and masked with
;   0xffff0000; the four v_min_f32 results land in v0-v3 and are masked with
;   0xffff0000 again. (The multiply by 1.0 is presumably input
;   canonicalization - TODO confirm.)
; * GFX8 and newer runs - only v0-v3 are read as inputs. Each register is split
;   into two f32 values via a 16-bit left shift (low half) and an and with
;   0xffff0000 (high half). Every v_min_f32 result is then narrowed with
;   bfe(bit 16) + add 0x7fff, while v_cmp_u selects the `or 0x400000` variant
;   when the value is unordered (presumably round-to-nearest-even plus NaN
;   quieting - TODO confirm). The halves are repacked with v_alignbit_b32 on
;   GFX8 and with v_perm_b32 using selector 0x7060302 on GFX9, GFX10 and GFX11.
17898 define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
17899 ; GCN-LABEL: v_minnum_v4bf16:
17901 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17902 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
17903 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
17904 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
17905 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
17906 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
17907 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
17908 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
17909 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
17910 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
17911 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17912 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
17913 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17914 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17915 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17916 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17917 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17918 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7
17919 ; GCN-NEXT: v_min_f32_e32 v2, v2, v6
17920 ; GCN-NEXT: v_min_f32_e32 v1, v1, v5
17921 ; GCN-NEXT: v_min_f32_e32 v0, v0, v4
17922 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17923 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17924 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17925 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17926 ; GCN-NEXT: s_setpc_b64 s[30:31]
17928 ; GFX7-LABEL: v_minnum_v4bf16:
17930 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17931 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
17932 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
17933 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
17934 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
17935 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
17936 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
17937 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
17938 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
17939 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
17940 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17941 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
17942 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17943 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
17944 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17945 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
17946 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17947 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
17948 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
17949 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
17950 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
17951 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17952 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17953 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17954 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17955 ; GFX7-NEXT: s_setpc_b64 s[30:31]
17957 ; GFX8-LABEL: v_minnum_v4bf16:
17959 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17960 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
17961 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
17962 ; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
17963 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
17964 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
17965 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
17966 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
17967 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
17968 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
17969 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
17970 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
17971 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
17972 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
17973 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
17974 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
17975 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
17976 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
17977 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17978 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
17979 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
17980 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
17981 ; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
17982 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
17983 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
17984 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
17985 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
17986 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
17987 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
17988 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
17989 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
17990 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
17991 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
17992 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
17993 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
17994 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
17995 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
17996 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
17997 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
17998 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
17999 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
18000 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
18001 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18003 ; GFX9-LABEL: v_minnum_v4bf16:
18005 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18006 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18007 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18008 ; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
18009 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18010 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18011 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
18012 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18013 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
18014 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
18015 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
18016 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18017 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
18018 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
18019 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
18020 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
18021 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18022 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
18023 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
18024 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
18025 ; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
18026 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18027 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18028 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
18029 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
18030 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
18031 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
18032 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18033 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
18034 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
18035 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
18036 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
18037 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18038 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
18039 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
18040 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
18041 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
18042 ; GFX9-NEXT: s_setpc_b64 s[30:31]
18044 ; GFX10-LABEL: v_minnum_v4bf16:
18046 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18047 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18048 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18049 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18050 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18051 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
18052 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
18053 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
18054 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18055 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18056 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
18057 ; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
18058 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
18059 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
18060 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
18061 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
18062 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
18063 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
18064 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
18065 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
18066 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
18067 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
18068 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18069 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
18070 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18071 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
18072 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
18073 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
18074 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18075 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18076 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18077 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18078 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
18079 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18080 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18081 ; GFX10-NEXT: s_setpc_b64 s[30:31]
18083 ; GFX11-LABEL: v_minnum_v4bf16:
18085 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18086 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
18087 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
18088 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18089 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18090 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
18091 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
18092 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18093 ; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
18094 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18095 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
18096 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18097 ; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
18098 ; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
18099 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
18100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
18101 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
18102 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
18103 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
18104 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
18105 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
18106 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
18107 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
18108 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
18109 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
18110 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
18111 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
18112 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18113 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
18114 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
18115 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
18116 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18117 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18118 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
18119 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18120 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
18121 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
18122 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
18123 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18124 ; GFX11-NEXT: s_setpc_b64 s[30:31]
18125 %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
18126 ret <4 x bfloat> %op
18129 define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
18130 ; GCN-LABEL: v_minnum_v8bf16:
18132 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18133 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18134 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
18135 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
18136 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
18137 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
18138 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
18139 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
18140 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
18141 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
18142 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
18143 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
18144 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
18145 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
18146 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
18147 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
18148 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
18149 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18150 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18151 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18152 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18153 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18154 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18155 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18156 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18157 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18158 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18159 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18160 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18161 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18162 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18163 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18164 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18165 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15
18166 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14
18167 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13
18168 ; GCN-NEXT: v_min_f32_e32 v4, v4, v12
18169 ; GCN-NEXT: v_min_f32_e32 v3, v3, v11
18170 ; GCN-NEXT: v_min_f32_e32 v2, v2, v10
18171 ; GCN-NEXT: v_min_f32_e32 v1, v1, v9
18172 ; GCN-NEXT: v_min_f32_e32 v0, v0, v8
18173 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18174 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18175 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18176 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18177 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18178 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18179 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18180 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18181 ; GCN-NEXT: s_setpc_b64 s[30:31]
18183 ; GFX7-LABEL: v_minnum_v8bf16:
18185 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18186 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18187 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
18188 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
18189 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
18190 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
18191 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
18192 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
18193 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
18194 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
18195 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
18196 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
18197 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
18198 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
18199 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
18200 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
18201 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
18202 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18203 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18204 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18205 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18206 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18207 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18208 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18209 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18210 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18211 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18212 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18213 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18214 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18215 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18216 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18217 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18218 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
18219 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
18220 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
18221 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
18222 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
18223 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
18224 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
18225 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
18226 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18227 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18228 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18229 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18230 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18231 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18232 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18233 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18234 ; GFX7-NEXT: s_setpc_b64 s[30:31]
18236 ; GFX8-LABEL: v_minnum_v8bf16:
18238 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18239 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18240 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18241 ; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
18242 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
18243 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
18244 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18245 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18246 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
18247 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
18248 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
18249 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
18250 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
18251 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
18252 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
18253 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
18254 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
18255 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
18256 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18257 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
18258 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
18259 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
18260 ; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
18261 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
18262 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
18263 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18264 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18265 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18266 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
18267 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
18268 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18269 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
18270 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
18271 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
18272 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
18273 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
18274 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18275 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
18276 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18277 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
18278 ; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
18279 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
18280 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
18281 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18282 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18283 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18284 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
18285 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
18286 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18287 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
18288 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
18289 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
18290 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
18291 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
18292 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18293 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
18294 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
18295 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
18296 ; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
18297 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
18298 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
18299 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18300 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18301 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18302 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
18303 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
18304 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18305 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
18306 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
18307 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
18308 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
18309 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
18310 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18311 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
18312 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
18313 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
18314 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
18315 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
18316 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
18317 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
18318 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
18319 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
18320 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18322 ; GFX9-LABEL: v_minnum_v8bf16:
18324 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18325 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18326 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18327 ; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
18328 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18329 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18330 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
18331 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18332 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
18333 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
18334 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
18335 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
18336 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
18337 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
18338 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
18339 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
18340 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18341 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
18342 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
18343 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
18344 ; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
18345 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18346 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18347 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
18348 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
18349 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
18350 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
18351 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18352 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
18353 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
18354 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
18355 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
18356 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18357 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
18358 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18359 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
18360 ; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
18361 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18362 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18363 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
18364 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
18365 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
18366 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
18367 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18368 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
18369 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
18370 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
18371 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
18372 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18373 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
18374 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
18375 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
18376 ; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
18377 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18378 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18379 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
18380 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
18381 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
18382 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
18383 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18384 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
18385 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
18386 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
18387 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
18388 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18389 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
18390 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
18391 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
18392 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
18393 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
18394 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
18395 ; GFX9-NEXT: s_setpc_b64 s[30:31]
18397 ; GFX10-LABEL: v_minnum_v8bf16:
18399 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18400 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18401 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18402 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18403 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18404 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
18405 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18406 ; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
18407 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
18408 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18409 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
18410 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
18411 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
18412 ; GFX10-NEXT: v_min_f32_e32 v7, v10, v9
18413 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
18414 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
18415 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
18416 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
18417 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
18418 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
18419 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18420 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
18421 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18422 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
18423 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
18424 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
18425 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
18426 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
18427 ; GFX10-NEXT: v_min_f32_e32 v6, v10, v6
18428 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
18429 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18430 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18431 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
18432 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
18433 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
18434 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
18435 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18436 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18437 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
18438 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
18439 ; GFX10-NEXT: v_min_f32_e32 v5, v15, v13
18440 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
18441 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
18442 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18443 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
18444 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
18445 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
18446 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
18447 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
18448 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
18449 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
18450 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
18451 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
18452 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
18453 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
18454 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
18455 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
18456 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
18457 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
18458 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18459 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18460 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18461 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18462 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
18463 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18464 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18465 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18466 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18467 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
18468 ; GFX10-NEXT: s_setpc_b64 s[30:31]
18470 ; GFX11-LABEL: v_minnum_v8bf16:
18472 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18473 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
18474 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
18475 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18476 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
18477 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
18478 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18479 ; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
18480 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
18481 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
18482 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18483 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
18484 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18485 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v7
18486 ; GFX11-NEXT: v_min_f32_e32 v7, v10, v9
18487 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
18488 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
18489 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18490 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
18491 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
18492 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
18493 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
18494 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
18495 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
18496 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
18497 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
18498 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
18499 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18500 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
18501 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18502 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
18503 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
18504 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
18505 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18506 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
18507 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18508 ; GFX11-NEXT: v_min_f32_e32 v6, v10, v6
18509 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
18510 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
18511 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
18512 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
18513 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
18514 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
18515 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
18516 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
18517 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
18518 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18519 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18520 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
18521 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
18522 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
18523 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
18524 ; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
18525 ; GFX11-NEXT: v_min_f32_e32 v5, v15, v13
18526 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18527 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
18528 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
18529 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
18530 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
18531 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
18532 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
18533 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
18534 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
18535 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
18536 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
18537 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
18538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
18539 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
18540 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
18541 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
18542 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18543 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
18544 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
18545 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
18546 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
18547 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
18548 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
18549 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
18550 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
18551 ; GFX11-NEXT: s_setpc_b64 s[30:31]
18552 %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
18553 ret <8 x bfloat> %op
18556 define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
18557 ; GCN-LABEL: v_minnum_v16bf16:
18559 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18560 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
18561 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
18562 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
18563 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18564 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30
18565 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
18566 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
18567 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
18568 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18569 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29
18570 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
18571 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
18572 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
18573 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18574 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28
18575 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
18576 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
18577 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
18578 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18579 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27
18580 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
18581 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
18582 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
18583 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18584 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26
18585 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
18586 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
18587 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
18588 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18589 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25
18590 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
18591 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
18592 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
18593 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18594 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24
18595 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
18596 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
18597 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
18598 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18599 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23
18600 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
18601 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
18602 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18603 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18604 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22
18605 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
18606 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
18607 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
18608 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18609 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21
18610 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
18611 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
18612 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
18613 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
18614 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
18615 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
18616 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
18617 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
18618 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
18619 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
18620 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
18621 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
18622 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18623 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20
18624 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
18625 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18626 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
18627 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18628 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
18629 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18630 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
18631 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18632 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18633 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18634 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19
18635 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18
18636 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17
18637 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16
18638 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18639 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18640 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18641 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18642 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18643 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18644 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18645 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18646 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18647 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18648 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18649 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18650 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18651 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18652 ; GCN-NEXT: s_waitcnt vmcnt(0)
18653 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
18654 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18655 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16
18656 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18657 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18658 ; GCN-NEXT: s_setpc_b64 s[30:31]
18660 ; GFX7-LABEL: v_minnum_v16bf16:
18662 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18663 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
18664 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
18665 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18666 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18667 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
18668 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
18669 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
18670 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
18671 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
18672 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
18673 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
18674 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
18675 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
18676 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
18677 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
18678 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
18679 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
18680 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
18681 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
18682 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
18683 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
18684 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
18685 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
18686 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
18687 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
18688 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
18689 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
18690 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
18691 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
18692 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
18693 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
18694 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
18695 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
18696 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
18697 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
18698 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
18699 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18700 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
18701 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18702 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
18703 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18704 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
18705 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18706 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
18707 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18708 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
18709 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18710 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
18711 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18712 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
18713 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18714 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18715 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
18716 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18717 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
18718 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18719 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
18720 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18721 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
18722 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18723 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
18724 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18725 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
18726 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18727 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
18728 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
18729 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
18730 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
18731 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
18732 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
18733 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
18734 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
18735 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
18736 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
18737 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
18738 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
18739 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
18740 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
18741 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18742 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18743 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18744 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18745 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18746 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18747 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18748 ; GFX7-NEXT: s_waitcnt vmcnt(0)
18749 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
18750 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
18751 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
18752 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18753 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18754 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18755 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18756 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18757 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18758 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18759 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18760 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18761 ; GFX7-NEXT: s_setpc_b64 s[30:31]
18763 ; GFX8-LABEL: v_minnum_v16bf16:
18765 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18766 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
18767 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
18768 ; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
18769 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
18770 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
18771 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
18772 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18773 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18774 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18775 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
18776 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
18777 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
18778 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
18779 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
18780 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
18781 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
18782 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
18783 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18784 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
18785 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
18786 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
18787 ; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
18788 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
18789 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
18790 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18791 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18792 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18793 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
18794 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
18795 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
18796 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
18797 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
18798 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
18799 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
18800 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
18801 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18802 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
18803 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
18804 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
18805 ; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
18806 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
18807 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
18808 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18809 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18810 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18811 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
18812 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
18813 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
18814 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
18815 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
18816 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
18817 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
18818 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
18819 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18820 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
18821 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
18822 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
18823 ; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
18824 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
18825 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
18826 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18827 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18828 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18829 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
18830 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
18831 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
18832 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
18833 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
18834 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
18835 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
18836 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
18837 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18838 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
18839 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
18840 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
18841 ; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
18842 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
18843 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
18844 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
18845 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
18846 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18847 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
18848 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
18849 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
18850 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
18851 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
18852 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
18853 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
18854 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
18855 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
18856 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
18857 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
18858 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
18859 ; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
18860 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
18861 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
18862 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
18863 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
18864 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18865 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
18866 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
18867 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
18868 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
18869 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
18870 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
18871 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
18872 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
18873 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
18874 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
18875 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
18876 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
18877 ; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
18878 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
18879 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
18880 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
18881 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
18882 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18883 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
18884 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
18885 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
18886 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
18887 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
18888 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
18889 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
18890 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
18891 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18892 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
18893 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
18894 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
18895 ; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
18896 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
18897 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
18898 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
18899 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
18900 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
18901 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
18902 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
18903 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
18904 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
18905 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
18906 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
18907 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
18908 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
18909 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
18910 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
18911 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
18912 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
18913 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
18914 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
18915 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
18916 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
18917 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
18918 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
18919 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
18920 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
18921 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
18922 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
18923 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
18924 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
18925 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
18926 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
18927 ; GFX8-NEXT: s_setpc_b64 s[30:31]
18929 ; GFX9-LABEL: v_minnum_v16bf16:
18931 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18932 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
18933 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
18934 ; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
18935 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
18936 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
18937 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
18938 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
18939 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
18940 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
18941 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
18942 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
18943 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
18944 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
18945 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
18946 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
18947 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
18948 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
18949 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
18950 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
18951 ; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
18952 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
18953 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
18954 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
18955 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
18956 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
18957 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
18958 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
18959 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
18960 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
18961 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
18962 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
18963 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
18964 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
18965 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
18966 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
18967 ; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
18968 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
18969 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
18970 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
18971 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
18972 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
18973 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
18974 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
18975 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
18976 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
18977 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
18978 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
18979 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
18980 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
18981 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
18982 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
18983 ; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
18984 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
18985 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
18986 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
18987 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
18988 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
18989 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
18990 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
18991 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
18992 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
18993 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
18994 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
18995 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
18996 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
18997 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
18998 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
18999 ; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
19000 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19001 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19002 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
19003 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
19004 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
19005 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
19006 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
19007 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
19008 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
19009 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
19010 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
19011 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
19012 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
19013 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
19014 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
19015 ; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
19016 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19017 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19018 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
19019 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
19020 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
19021 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
19022 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
19023 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
19024 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
19025 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
19026 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
19027 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
19028 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
19029 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
19030 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
19031 ; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
19032 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19033 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19034 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
19035 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
19036 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
19037 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
19038 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
19039 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
19040 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
19041 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
19042 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
19043 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
19044 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
19045 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
19046 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
19047 ; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
19048 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19049 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19050 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
19051 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
19052 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
19053 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
19054 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
19055 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
19056 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
19057 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
19058 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
19059 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
19060 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
19061 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
19062 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
19063 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
19064 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
19065 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
19066 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
19067 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
19068 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
19069 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
19070 ; GFX9-NEXT: s_setpc_b64 s[30:31]
19072 ; GFX10-LABEL: v_minnum_v16bf16:
19074 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19075 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
19076 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
19077 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19078 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19079 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
19080 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19081 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
19082 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
19083 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
19084 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19085 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
19086 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
19087 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
19088 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
19089 ; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
19090 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
19091 ; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
19092 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
19093 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
19094 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
19095 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
19096 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
19097 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
19098 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
19099 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
19100 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19101 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19102 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
19103 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
19104 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19105 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19106 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
19107 ; GFX10-NEXT: v_min_f32_e32 v17, v20, v19
19108 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
19109 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
19110 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
19111 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
19112 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
19113 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
19114 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
19115 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
19116 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
19117 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19118 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19119 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
19120 ; GFX10-NEXT: v_min_f32_e32 v13, v19, v18
19121 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
19122 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
19123 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19124 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
19125 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
19126 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
19127 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
19128 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19129 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
19130 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
19131 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
19132 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
19133 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19134 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19135 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
19136 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19137 ; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
19138 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
19139 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
19140 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
19141 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
19142 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
19143 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19144 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
19145 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
19146 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
19147 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
19148 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19149 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
19150 ; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
19151 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19152 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
19153 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
19154 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
19155 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
19156 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
19157 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19158 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
19159 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
19160 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
19161 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
19162 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19163 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19164 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
19165 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
19166 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
19167 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19168 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
19169 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
19170 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
19171 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19172 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
19173 ; GFX10-NEXT: v_min_f32_e32 v19, v22, v20
19174 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
19175 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
19176 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19177 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19178 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
19179 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
19180 ; GFX10-NEXT: v_min_f32_e32 v9, v22, v20
19181 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
19182 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
19183 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
19184 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
19185 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
19186 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
19187 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
19188 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
19189 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
19190 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19191 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
19192 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19193 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
19194 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
19195 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
19196 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
19197 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19198 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
19199 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
19200 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
19201 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19202 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
19203 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19204 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
19205 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
19206 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19207 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
19208 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
19209 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19210 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
19211 ; GFX10-NEXT: s_setpc_b64 s[30:31]
19213 ; GFX11-LABEL: v_minnum_v16bf16:
19215 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19216 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
19217 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
19218 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19219 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
19220 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
19221 ; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
19222 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
19223 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19224 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
19225 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
19226 ; GFX11-NEXT: v_min_f32_e32 v17, v18, v17
19227 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v14
19228 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
19229 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
19230 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
19231 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
19232 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19233 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
19234 ; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
19235 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
19236 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
19237 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
19238 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19239 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
19240 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
19241 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
19242 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
19243 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
19244 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19245 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
19246 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
19247 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
19248 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19249 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
19250 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
19251 ; GFX11-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
19252 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
19253 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
19254 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
19255 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19256 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19257 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19258 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
19259 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
19260 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
19261 ; GFX11-NEXT: v_min_f32_e32 v4, v4, v12
19262 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
19263 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19264 ; GFX11-NEXT: v_min_f32_e32 v5, v5, v13
19265 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
19266 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
19267 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
19268 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
19269 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
19270 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
19271 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
19272 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
19273 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
19274 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
19275 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
19276 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
19277 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
19278 ; GFX11-NEXT: v_min_f32_e32 v12, v18, v12
19279 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19280 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
19281 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
19282 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
19283 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
19284 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19285 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
19286 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
19287 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
19288 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
19289 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
19290 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
19291 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
19292 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
19293 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
19294 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
19295 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
19296 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19297 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19298 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
19299 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
19300 ; GFX11-NEXT: v_min_f32_e32 v18, v19, v18
19301 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
19302 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
19303 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19304 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19305 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
19306 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
19307 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19308 ; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
19309 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v11
19310 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
19311 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
19312 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
19313 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
19314 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
19315 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
19316 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
19317 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
19318 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
19319 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
19320 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
19321 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
19322 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
19323 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
19324 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
19325 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
19326 ; GFX11-NEXT: v_min_f32_e32 v19, v22, v20
19327 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
19328 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
19329 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19330 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
19331 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
19332 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19333 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19334 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
19335 ; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9
19336 ; GFX11-NEXT: v_min_f32_e32 v9, v22, v20
19337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19338 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
19339 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
19340 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
19341 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
19342 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
19343 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
19344 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
19345 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
19346 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
19347 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19348 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
19349 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
19350 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
19351 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
19352 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
19353 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
19354 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
19355 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
19356 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
19357 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
19358 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
19359 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
19360 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
19361 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
19362 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
19363 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
19364 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
19365 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
19366 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
19367 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
19368 ; GFX11-NEXT: s_setpc_b64 s[30:31]
19369 %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
19370 ret <16 x bfloat> %op
19373 define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
19374 ; GCN-LABEL: v_minnum_v32bf16:
19376 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19377 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
19378 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
19379 ; GCN-NEXT: s_waitcnt vmcnt(1)
19380 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
19381 ; GCN-NEXT: s_waitcnt vmcnt(0)
19382 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
19383 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19384 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19385 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
19386 ; GCN-NEXT: v_min_f32_e32 v31, v31, v32
19387 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
19388 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19389 ; GCN-NEXT: s_waitcnt vmcnt(0)
19390 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19391 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19392 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
19393 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32
19394 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
19395 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19396 ; GCN-NEXT: s_waitcnt vmcnt(0)
19397 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19398 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19399 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
19400 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32
19401 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
19402 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19403 ; GCN-NEXT: s_waitcnt vmcnt(0)
19404 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19405 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19406 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
19407 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32
19408 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
19409 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19410 ; GCN-NEXT: s_waitcnt vmcnt(0)
19411 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19412 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19413 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
19414 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32
19415 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
19416 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19417 ; GCN-NEXT: s_waitcnt vmcnt(0)
19418 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19419 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19420 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
19421 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32
19422 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
19423 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19424 ; GCN-NEXT: s_waitcnt vmcnt(0)
19425 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19426 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19427 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
19428 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32
19429 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
19430 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19431 ; GCN-NEXT: s_waitcnt vmcnt(0)
19432 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19433 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19434 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
19435 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32
19436 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
19437 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19438 ; GCN-NEXT: s_waitcnt vmcnt(0)
19439 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19440 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19441 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
19442 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32
19443 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
19444 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19445 ; GCN-NEXT: s_waitcnt vmcnt(0)
19446 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19447 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19448 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
19449 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32
19450 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
19451 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19452 ; GCN-NEXT: s_waitcnt vmcnt(0)
19453 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19454 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19455 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
19456 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32
19457 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
19458 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19459 ; GCN-NEXT: s_waitcnt vmcnt(0)
19460 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19461 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19462 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
19463 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32
19464 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
19465 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19466 ; GCN-NEXT: s_waitcnt vmcnt(0)
19467 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19468 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19469 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
19470 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32
19471 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
19472 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19473 ; GCN-NEXT: s_waitcnt vmcnt(0)
19474 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19475 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19476 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
19477 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32
19478 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
19479 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19480 ; GCN-NEXT: s_waitcnt vmcnt(0)
19481 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19482 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19483 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
19484 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32
19485 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
19486 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19487 ; GCN-NEXT: s_waitcnt vmcnt(0)
19488 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19489 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19490 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
19491 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32
19492 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
19493 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19494 ; GCN-NEXT: s_waitcnt vmcnt(0)
19495 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19496 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19497 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
19498 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32
19499 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
19500 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19501 ; GCN-NEXT: s_waitcnt vmcnt(0)
19502 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19503 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19504 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
19505 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32
19506 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
19507 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19508 ; GCN-NEXT: s_waitcnt vmcnt(0)
19509 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19510 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19511 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
19512 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32
19513 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
19514 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19515 ; GCN-NEXT: s_waitcnt vmcnt(0)
19516 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19517 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19518 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
19519 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32
19520 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
19521 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19522 ; GCN-NEXT: s_waitcnt vmcnt(0)
19523 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19524 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19525 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
19526 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32
19527 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
19528 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19529 ; GCN-NEXT: s_waitcnt vmcnt(0)
19530 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19531 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19532 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
19533 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32
19534 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
19535 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19536 ; GCN-NEXT: s_waitcnt vmcnt(0)
19537 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19538 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19539 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
19540 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32
19541 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
19542 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19543 ; GCN-NEXT: s_waitcnt vmcnt(0)
19544 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19545 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19546 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
19547 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32
19548 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
19549 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19550 ; GCN-NEXT: s_waitcnt vmcnt(0)
19551 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19552 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19553 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
19554 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32
19555 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
19556 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19557 ; GCN-NEXT: s_waitcnt vmcnt(0)
19558 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19559 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19560 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
19561 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32
19562 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
19563 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19564 ; GCN-NEXT: s_waitcnt vmcnt(0)
19565 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19566 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19567 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
19568 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32
19569 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
19570 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19571 ; GCN-NEXT: s_waitcnt vmcnt(0)
19572 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19573 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19574 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
19575 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32
19576 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
19577 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19578 ; GCN-NEXT: s_waitcnt vmcnt(0)
19579 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19580 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19581 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
19582 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32
19583 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
19584 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19585 ; GCN-NEXT: s_waitcnt vmcnt(0)
19586 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19587 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19588 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
19589 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32
19590 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
19591 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19592 ; GCN-NEXT: s_waitcnt vmcnt(0)
19593 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19594 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19595 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
19596 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32
19597 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
19598 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19599 ; GCN-NEXT: s_waitcnt vmcnt(0)
19600 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
19601 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19602 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32
19603 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19604 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19605 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19606 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19607 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19608 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19609 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19610 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19611 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19612 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19613 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19614 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19615 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19616 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19617 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19618 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19619 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19620 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19621 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19622 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19623 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19624 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19625 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19626 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19627 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19628 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19629 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19630 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19631 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19632 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19633 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19634 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19635 ; GCN-NEXT: s_setpc_b64 s[30:31]
19637 ; GFX7-LABEL: v_minnum_v32bf16:
19639 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19640 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
19641 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
19642 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
19643 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19644 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
19645 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19646 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
19647 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19648 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
19649 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19650 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
19651 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19652 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
19653 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19654 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
19655 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19656 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
19657 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19658 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
19659 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19660 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
19661 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19662 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
19663 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19664 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
19665 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19666 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
19667 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19668 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
19669 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19670 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
19671 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19672 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
19673 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19674 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
19675 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19676 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
19677 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19678 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
19679 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19680 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
19681 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19682 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
19683 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19684 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
19685 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19686 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
19687 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19688 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
19689 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19690 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
19691 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19692 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
19693 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19694 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
19695 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19696 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
19697 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19698 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
19699 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19700 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
19701 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19702 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
19703 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19704 ; GFX7-NEXT: s_waitcnt vmcnt(1)
19705 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
19706 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19707 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19708 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19709 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19710 ; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
19711 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
19712 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
19713 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19714 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19715 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19716 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
19717 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
19718 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19719 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19720 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19721 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19722 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
19723 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
19724 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19725 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19726 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19727 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19728 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
19729 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
19730 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19731 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19732 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19733 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19734 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
19735 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
19736 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19737 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19738 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19739 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19740 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
19741 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
19742 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
19743 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19744 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19745 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19746 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
19747 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
19748 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
19749 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19750 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19751 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19752 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
19753 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
19754 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
19755 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19756 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19757 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19758 ; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
19759 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
19760 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
19761 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19762 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19763 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19764 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
19765 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
19766 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
19767 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19768 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19769 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19770 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
19771 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
19772 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
19773 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19774 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19775 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19776 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
19777 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
19778 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
19779 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19780 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19781 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19782 ; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
19783 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
19784 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
19785 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19786 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19787 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19788 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
19789 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
19790 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
19791 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19792 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19793 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19794 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
19795 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
19796 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
19797 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19798 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19799 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19800 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
19801 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
19802 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
19803 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19804 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19805 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19806 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
19807 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
19808 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19809 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19810 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19811 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19812 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
19813 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
19814 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19815 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19816 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19817 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19818 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
19819 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
19820 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19821 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19822 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19823 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19824 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
19825 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
19826 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19827 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19828 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19829 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19830 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
19831 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
19832 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19833 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19834 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19835 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19836 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
19837 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
19838 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
19839 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19840 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19841 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19842 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
19843 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
19844 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
19845 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19846 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19847 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19848 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
19849 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
19850 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
19851 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19852 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19853 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19854 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
19855 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
19856 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
19857 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19858 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19859 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19860 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
19861 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
19862 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
19863 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19864 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19865 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19866 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
19867 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
19868 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
19869 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19870 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19871 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19872 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
19873 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
19874 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
19875 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19876 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19877 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19878 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
19879 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
19880 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
19881 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19882 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19883 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19884 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
19885 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
19886 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
19887 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19888 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19889 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19890 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
19891 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
19892 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
19893 ; GFX7-NEXT: s_waitcnt vmcnt(0)
19894 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
19895 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
19896 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
19897 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
19898 ; GFX7-NEXT: s_setpc_b64 s[30:31]
19900 ; GFX8-LABEL: v_minnum_v32bf16:
19902 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19903 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
19904 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
19905 ; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
19906 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
19907 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
19908 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
19909 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19910 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
19911 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
19912 ; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
19913 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
19914 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
19915 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
19916 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
19917 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
19918 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
19919 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
19920 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
19921 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
19922 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
19923 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
19924 ; GFX8-NEXT: v_min_f32_e32 v32, v32, v30
19925 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
19926 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
19927 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
19928 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
19929 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
19930 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
19931 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
19932 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
19933 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
19934 ; GFX8-NEXT: s_waitcnt vmcnt(0)
19935 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
19936 ; GFX8-NEXT: v_min_f32_e32 v33, v33, v34
19937 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
19938 ; GFX8-NEXT: v_min_f32_e32 v30, v15, v30
19939 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
19940 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
19941 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
19942 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
19943 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
19944 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
19945 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
19946 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
19947 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19948 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
19949 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
19950 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
19951 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
19952 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
19953 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19954 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
19955 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
19956 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
19957 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
19958 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
19959 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
19960 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
19961 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
19962 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
19963 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
19964 ; GFX8-NEXT: v_min_f32_e32 v29, v33, v29
19965 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
19966 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
19967 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
19968 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
19969 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19970 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
19971 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
19972 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
19973 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
19974 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
19975 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
19976 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
19977 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
19978 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
19979 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
19980 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
19981 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
19982 ; GFX8-NEXT: v_min_f32_e32 v28, v33, v28
19983 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
19984 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
19985 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
19986 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
19987 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
19988 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
19989 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
19990 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
19991 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
19992 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
19993 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
19994 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
19995 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
19996 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
19997 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
19998 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
19999 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
20000 ; GFX8-NEXT: v_min_f32_e32 v27, v33, v27
20001 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
20002 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
20003 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20004 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20005 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20006 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
20007 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
20008 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
20009 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
20010 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
20011 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
20012 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
20013 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
20014 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
20015 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
20016 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
20017 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
20018 ; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
20019 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
20020 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
20021 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20022 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20023 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20024 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
20025 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
20026 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
20027 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
20028 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
20029 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
20030 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
20031 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
20032 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
20033 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
20034 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
20035 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
20036 ; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
20037 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
20038 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
20039 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20040 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20041 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20042 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
20043 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
20044 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
20045 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
20046 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
20047 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
20048 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
20049 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
20050 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
20051 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
20052 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
20053 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
20054 ; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
20055 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
20056 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
20057 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20058 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20059 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20060 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
20061 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
20062 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
20063 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
20064 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
20065 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
20066 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
20067 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
20068 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
20069 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
20070 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
20071 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
20072 ; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
20073 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
20074 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
20075 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20076 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20077 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20078 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
20079 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
20080 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
20081 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
20082 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
20083 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
20084 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
20085 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
20086 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
20087 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
20088 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
20089 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
20090 ; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
20091 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
20092 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
20093 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20094 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20095 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20096 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
20097 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
20098 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
20099 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
20100 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
20101 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
20102 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
20103 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
20104 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
20105 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
20106 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
20107 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
20108 ; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
20109 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
20110 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
20111 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20112 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20113 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20114 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
20115 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
20116 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
20117 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
20118 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
20119 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
20120 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
20121 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
20122 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
20123 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
20124 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
20125 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
20126 ; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
20127 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
20128 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
20129 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20130 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20131 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20132 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
20133 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
20134 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
20135 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
20136 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
20137 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
20138 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
20139 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
20140 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
20141 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
20142 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
20143 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
20144 ; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
20145 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
20146 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
20147 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20148 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20149 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20150 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
20151 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
20152 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
20153 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
20154 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
20155 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
20156 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
20157 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
20158 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
20159 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
20160 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
20161 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
20162 ; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
20163 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
20164 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
20165 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20166 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20167 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20168 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
20169 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
20170 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
20171 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
20172 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
20173 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
20174 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
20175 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
20176 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
20177 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
20178 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
20179 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
20180 ; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
20181 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
20182 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
20183 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20184 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20185 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
20186 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
20187 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
20188 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
20189 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
20190 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
20191 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
20192 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
20193 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
20194 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
20195 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
20196 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
20197 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
20198 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
20199 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
20200 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
20201 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
20202 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
20203 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
20204 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
20205 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
20206 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
20207 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
20208 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
20209 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
20210 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
20211 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
20212 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
20213 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
20214 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
20215 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
20216 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
20217 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
20218 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
20219 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
20220 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
20221 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
20222 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
20223 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
20224 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
20225 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
20226 ; GFX8-NEXT: s_setpc_b64 s[30:31]
20228 ; GFX9-LABEL: v_minnum_v32bf16:
20230 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20231 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
20232 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
20233 ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
20234 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
20235 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20236 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
20237 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
20238 ; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
20239 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
20240 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
20241 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
20242 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
20243 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
20244 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
20245 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
20246 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
20247 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
20248 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
20249 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
20250 ; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
20251 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20252 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20253 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
20254 ; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
20255 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
20256 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
20257 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
20258 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
20259 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
20260 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
20261 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
20262 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
20263 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
20264 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
20265 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
20266 ; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
20267 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
20268 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
20269 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
20270 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
20271 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
20272 ; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
20273 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
20274 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
20275 ; GFX9-NEXT: s_waitcnt vmcnt(0)
20276 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
20277 ; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
20278 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20279 ; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
20280 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
20281 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
20282 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
20283 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
20284 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
20285 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
20286 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
20287 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
20288 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
20289 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
20290 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
20291 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
20292 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
20293 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
20294 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
20295 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
20296 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
20297 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
20298 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
20299 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
20300 ; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
20301 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20302 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20303 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
20304 ; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
20305 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
20306 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
20307 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
20308 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
20309 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
20310 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
20311 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
20312 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
20313 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
20314 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
20315 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
20316 ; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
20317 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20318 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20319 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
20320 ; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
20321 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
20322 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
20323 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
20324 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
20325 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
20326 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
20327 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
20328 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
20329 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
20330 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
20331 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
20332 ; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
20333 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20334 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20335 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
20336 ; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
20337 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
20338 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
20339 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
20340 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
20341 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
20342 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
20343 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
20344 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
20345 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
20346 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
20347 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
20348 ; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
20349 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20350 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20351 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
20352 ; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
20353 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
20354 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
20355 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
20356 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
20357 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
20358 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
20359 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
20360 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
20361 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
20362 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
20363 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
20364 ; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
20365 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20366 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20367 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
20368 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
20369 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
20370 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
20371 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
20372 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
20373 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
20374 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
20375 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
20376 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
20377 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
20378 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
20379 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
20380 ; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
20381 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20382 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20383 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
20384 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
20385 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
20386 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
20387 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
20388 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
20389 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
20390 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
20391 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
20392 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
20393 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
20394 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
20395 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
20396 ; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
20397 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20398 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20399 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
20400 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
20401 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
20402 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
20403 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
20404 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
20405 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
20406 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
20407 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
20408 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
20409 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
20410 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
20411 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
20412 ; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
20413 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20414 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20415 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
20416 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
20417 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
20418 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
20419 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
20420 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
20421 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
20422 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
20423 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
20424 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
20425 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
20426 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
20427 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
20428 ; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
20429 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20430 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20431 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
20432 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
20433 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
20434 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
20435 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
20436 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
20437 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
20438 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
20439 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
20440 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
20441 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
20442 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
20443 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
20444 ; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
20445 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20446 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20447 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
20448 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
20449 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
20450 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
20451 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
20452 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
20453 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
20454 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
20455 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
20456 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
20457 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
20458 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
20459 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
20460 ; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
20461 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20462 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20463 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
20464 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
20465 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
20466 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
20467 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
20468 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
20469 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
20470 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
20471 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
20472 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
20473 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
20474 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
20475 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
20476 ; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
20477 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20478 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20479 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
20480 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
20481 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
20482 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
20483 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
20484 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
20485 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
20486 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
20487 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
20488 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
20489 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
20490 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
20491 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
20492 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
20493 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
20494 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
20495 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
20496 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
20497 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
20498 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
20499 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
20500 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
20501 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
20502 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
20503 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
20504 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
20505 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
20506 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
20507 ; GFX9-NEXT: s_setpc_b64 s[30:31]
20509 ; GFX10-LABEL: v_minnum_v32bf16:
20511 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20512 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
20513 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
20514 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
20515 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20516 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20517 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
20518 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
20519 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
20520 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
20521 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
20522 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
20523 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
20524 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
20525 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
20526 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
20527 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20528 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20529 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
20530 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
20531 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
20532 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20533 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
20534 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
20535 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20536 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20537 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
20538 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
20539 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20540 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20541 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
20542 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
20543 ; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
20544 ; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
20545 ; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
20546 ; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
20547 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
20548 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
20549 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
20550 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20551 ; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
20552 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
20553 ; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
20554 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
20555 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20556 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
20557 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
20558 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
20559 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20560 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20561 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
20562 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
20563 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20564 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20565 ; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
20566 ; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
20567 ; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
20568 ; GFX10-NEXT: v_min_f32_e32 v24, v64, v55
20569 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
20570 ; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
20571 ; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
20572 ; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
20573 ; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
20574 ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
20575 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
20576 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
20577 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
20578 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
20579 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
20580 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20581 ; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
20582 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
20583 ; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
20584 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
20585 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20586 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20587 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
20588 ; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
20589 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
20590 ; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
20591 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
20592 ; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
20593 ; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
20594 ; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
20595 ; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
20596 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
20597 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
20598 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
20599 ; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
20600 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
20601 ; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
20602 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
20603 ; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
20604 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20605 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20606 ; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
20607 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
20608 ; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
20609 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
20610 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20611 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20612 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
20613 ; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
20614 ; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
20615 ; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
20616 ; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
20617 ; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
20618 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
20619 ; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
20620 ; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
20621 ; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
20622 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
20623 ; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
20624 ; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
20625 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
20626 ; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
20627 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
20628 ; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
20629 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
20630 ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
20631 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
20632 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
20633 ; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
20634 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
20635 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
20636 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
20637 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
20638 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
20639 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
20640 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
20641 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
20642 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
20643 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
20644 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
20645 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
20646 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
20647 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
20648 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
20649 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
20650 ; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
20651 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
20652 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
20653 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
20654 ; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
20655 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
20656 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
20657 ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
20658 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
20659 ; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
20660 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
20661 ; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
20662 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
20663 ; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
20664 ; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
20665 ; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
20666 ; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
20667 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
20668 ; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
20669 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
20670 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
20671 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
20672 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
20673 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
20674 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
20675 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
20676 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
20677 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
20678 ; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
20679 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
20680 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
20681 ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
20682 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
20683 ; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
20684 ; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
20685 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
20686 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
20687 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
20688 ; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
20689 ; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
20690 ; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
20691 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
20692 ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
20693 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
20694 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
20695 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
20696 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
20697 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
20698 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
20699 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
20700 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
20701 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
20702 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
20703 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
20704 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
20705 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
20706 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
20707 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
20708 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
20709 ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
20710 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
20711 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
20712 ; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
20713 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
20714 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
20715 ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
20716 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
20717 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
20718 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
20719 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
20720 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
20721 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
20722 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
20723 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
20724 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
20725 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
20726 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
20727 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
20728 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
20729 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
20730 ; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
20731 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
20732 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
20733 ; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
20734 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
20735 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
20736 ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
20737 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
20738 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
20739 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
20740 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
20741 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
20742 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
20743 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
20744 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
20745 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
20746 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
20747 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
20748 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
20749 ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
20750 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
20751 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
20752 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
20753 ; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
20754 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
20755 ; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
20756 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
20757 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
20758 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
20759 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
20760 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
20761 ; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
20762 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
20763 ; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
20764 ; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
20765 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
20766 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
20767 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
20768 ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
20769 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
20770 ; GFX10-NEXT: s_waitcnt vmcnt(0)
20771 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
20772 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
20773 ; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
20774 ; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
20775 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
20776 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
20777 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
20778 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
20779 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
20780 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
20781 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
20782 ; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
20783 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
20784 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
20785 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
20786 ; GFX10-NEXT: s_setpc_b64 s[30:31]
20788 ; GFX11-LABEL: v_minnum_v32bf16:
20790 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20791 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
20792 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
20793 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
20794 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
20795 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
20796 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
20797 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
20798 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
20799 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
20800 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
20801 ; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
20802 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
20803 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20804 ; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
20805 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
20806 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
20807 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
20808 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
20809 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
20810 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
20811 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
20812 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
20813 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
20814 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
20815 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
20816 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
20817 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
20818 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20819 ; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
20820 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
20821 ; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
20822 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
20823 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
20824 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
20825 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20826 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
20827 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
20828 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
20829 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
20830 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
20831 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
20832 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
20833 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
20834 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
20835 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
20836 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
20837 ; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
20838 ; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
20839 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
20840 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
20841 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20842 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
20843 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
20844 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
20845 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
20846 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
20847 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
20848 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
20849 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
20850 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
20851 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
20852 ; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
20853 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v16
20854 ; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
20855 ; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
20856 ; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
20857 ; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
20858 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
20859 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
20860 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
20861 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
20862 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
20863 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
20864 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
20865 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
20866 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
20867 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
20868 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
20869 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
20870 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
20871 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
20872 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
20873 ; GFX11-NEXT: v_min_f32_e32 v4, v4, v20
20874 ; GFX11-NEXT: v_min_f32_e32 v20, v80, v71
20875 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
20876 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
20877 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
20878 ; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
20879 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
20880 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
20881 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
20882 ; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
20883 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
20884 ; GFX11-NEXT: v_min_f32_e32 v26, v52, v51
20885 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20886 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
20887 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
20888 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
20889 ; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
20890 ; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
20891 ; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
20892 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
20893 ; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
20894 ; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
20895 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
20896 ; GFX11-NEXT: v_min_f32_e32 v29, v38, v37
20897 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
20898 ; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
20899 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
20900 ; GFX11-NEXT: v_min_f32_e32 v14, v14, v30
20901 ; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
20902 ; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
20903 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
20904 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
20905 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
20906 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
20907 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
20908 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
20909 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
20910 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
20911 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
20912 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
20913 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
20914 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
20915 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
20916 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
20917 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
20918 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
20919 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
20920 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
20921 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
20922 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
20923 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
20924 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
20925 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
20926 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
20927 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
20928 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
20929 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
20930 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
20931 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
20932 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
20933 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
20934 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
20935 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
20936 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
20937 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
20938 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
20939 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
20940 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
20941 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
20942 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
20943 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
20944 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
20945 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
20946 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
20947 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
20948 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
20949 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
20950 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
20951 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
20952 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
20953 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
20954 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
20955 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
20956 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
20957 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
20958 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
20959 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
20960 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
20961 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
20962 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
20963 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
20964 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
20965 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
20966 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
20967 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
20968 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
20969 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
20970 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
20971 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
20972 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
20973 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
20974 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
20975 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
20976 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
20977 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
20978 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
20979 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
20980 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
20981 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
20982 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
20983 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
20984 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
20985 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
20986 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
20987 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
20988 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
20989 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
20990 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
20991 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
20992 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
20993 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
20994 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
20995 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
20996 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
20997 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
20998 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
20999 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
21000 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
21001 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
21002 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
21003 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
21004 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
21005 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
21006 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
21007 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21008 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
21009 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
21010 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
21011 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
21012 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
21013 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
21014 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
21015 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
21016 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
21017 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
21018 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21019 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
21020 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
21021 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21022 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
21023 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
21024 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
21025 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
21026 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
21027 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
21028 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
21029 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
21030 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21031 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
21032 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
21033 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
21034 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
21035 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
21036 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21037 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
21038 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21039 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
21040 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
21041 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
21043 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
21044 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
21045 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
21046 ; GFX11-NEXT: s_waitcnt vmcnt(0)
21047 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
21048 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21049 ; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
21050 ; GFX11-NEXT: v_min_f32_e32 v15, v15, v18
21051 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21052 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
21053 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
21054 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
21055 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
21056 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
21057 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
21058 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
21059 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
21060 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
21061 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
21062 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
21063 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21064 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
21065 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21066 %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
21067 ret <32 x bfloat> %op
; Declarations of the llvm.maxnum intrinsic overloads for bfloat: the scalar
; form plus the 2-, 3-, 4-, 8-, 16- and 32-element vector forms. Each overload
; takes two operands of the same type and returns a value of that type.
21071 declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
21072 declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
21073 declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
21074 declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
21075 declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
21076 declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
21077 declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; Scalar bfloat case of the llvm.maxnum tests: calls @llvm.maxnum.bf16 on %a
; and %b.
;
; The assertion lines below are autogenerated (the note at the top of this file
; names utils/update_llc_test_checks.py); regenerate them with that script
; instead of editing them by hand.
;
; Shape of the expected output, as recorded by the assertions:
;  - GCN and GFX7 prefixes - v_mul_f32 by 1.0 on each input, both inputs masked
;    with 0xffff0000, v_max_f32, then the result masked with 0xffff0000.
;  - GFX8 and later prefixes - both inputs shifted left by 16, v_max_f32, then a
;    v_bfe_u32 / add of 0x7fff / v_or_b32 with 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 sequence and a final shift right by 16 (presumably the f32
;    to bf16 rounding with NaN handling - TODO confirm against the lowering).
21079 define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
21080 ; GCN-LABEL: v_maxnum_bf16:
21082 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21083 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21084 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21085 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21086 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21087 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1
21088 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21089 ; GCN-NEXT: s_setpc_b64 s[30:31]
21091 ; GFX7-LABEL: v_maxnum_bf16:
21093 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21094 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21095 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21096 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21097 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21098 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
21099 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21100 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21102 ; GFX8-LABEL: v_maxnum_bf16:
21104 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21105 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21106 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21107 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
21108 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
21109 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
21110 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
21111 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
21112 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21113 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
21114 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21115 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21117 ; GFX9-LABEL: v_maxnum_bf16:
21119 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21120 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21121 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21122 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
21123 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
21124 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21125 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
21126 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
21127 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21128 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
21129 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21130 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21132 ; GFX10-LABEL: v_maxnum_bf16:
21134 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21135 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21136 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21137 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
21138 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
21139 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
21140 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21141 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
21142 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21143 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21144 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21146 ; GFX11-LABEL: v_maxnum_bf16:
21148 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21149 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21150 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
21151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21152 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
21153 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
21154 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
21155 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
21157 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
21158 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
21159 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21160 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21161 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21162 %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
; Two-element bfloat vector case of the llvm.maxnum tests: calls
; @llvm.maxnum.v2bf16 on %a and %b and returns the result.
;
; The assertion lines below are autogenerated (the note at the top of this file
; names utils/update_llc_test_checks.py); regenerate them with that script
; instead of editing them by hand.
;
; Shape of the expected output, as recorded by the assertions:
;  - GCN and GFX7 prefixes - operands are read from v0-v3; each gets a
;    v_mul_f32 by 1.0 and a 0xffff0000 mask, then two v_max_f32 (v1 with v3,
;    v0 with v2), and both results are masked with 0xffff0000.
;  - GFX8 and later prefixes - operands are read from v0 and v1; one
;    v_max_f32 works on the values shifted left by 16 and the other on the
;    values masked with 0xffff0000. Each result then goes through the
;    v_bfe_u32 / add of 0x7fff / v_or_b32 with 0x400000 / v_cmp_u_f32 /
;    v_cndmask_b32 sequence (presumably the f32 to bf16 rounding with NaN
;    handling - TODO confirm against the lowering).
;  - Repacking into v0 - GFX8 uses v_lshrrev_b32 plus v_alignbit_b32; GFX9 and
;    later use v_perm_b32 with selector 0x7060302.
21166 define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
21167 ; GCN-LABEL: v_maxnum_v2bf16:
21169 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21170 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21171 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21172 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21173 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21174 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21175 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21176 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21177 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21178 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3
21179 ; GCN-NEXT: v_max_f32_e32 v0, v0, v2
21180 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21181 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21182 ; GCN-NEXT: s_setpc_b64 s[30:31]
21184 ; GFX7-LABEL: v_maxnum_v2bf16:
21186 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21187 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21188 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21189 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21190 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21191 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21192 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21193 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21194 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21195 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
21196 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
21197 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21198 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21199 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21201 ; GFX8-LABEL: v_maxnum_v2bf16:
21203 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21204 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21205 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21206 ; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
21207 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
21208 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
21209 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21210 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21211 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
21212 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
21213 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
21214 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21215 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
21216 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
21217 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
21218 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
21219 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
21220 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21221 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
21222 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21223 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
21224 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21226 ; GFX9-LABEL: v_maxnum_v2bf16:
21228 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21229 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21230 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21231 ; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
21232 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21233 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21234 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
21235 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21236 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
21237 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
21238 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
21239 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21240 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
21241 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
21242 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
21243 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
21244 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21245 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
21246 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21247 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
21248 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21250 ; GFX10-LABEL: v_maxnum_v2bf16:
21252 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21253 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21254 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21255 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21256 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21257 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
21258 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
21259 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
21260 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
21261 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
21262 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21263 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
21264 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
21265 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
21266 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21267 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21268 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21269 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
21270 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21272 ; GFX11-LABEL: v_maxnum_v2bf16:
21274 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21275 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
21276 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21277 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
21278 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21279 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
21280 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
21281 ; GFX11-NEXT: v_max_f32_e32 v2, v3, v2
21282 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
21283 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
21284 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
21285 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
21286 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
21287 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
21288 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
21289 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
21290 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
21291 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
21292 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21293 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
21294 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21295 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
21296 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21297 %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
21298 ret <2 x bfloat> %op
; Three-element bfloat vector case of the llvm.maxnum tests: calls
; @llvm.maxnum.v3bf16 on %a and %b and returns the result.
;
; The assertion lines below are autogenerated (the note at the top of this file
; names utils/update_llc_test_checks.py); regenerate them with that script
; instead of editing them by hand.
;
; Shape of the expected output, as recorded by the assertions:
;  - GCN and GFX7 prefixes - operands are read from v0-v5; each gets a
;    v_mul_f32 by 1.0 and a 0xffff0000 mask, then three v_max_f32 (v2 with v5,
;    v1 with v4, v0 with v3), and all three results are masked with 0xffff0000.
;  - GFX8 and later prefixes - operands are read from v0-v3 (v0 paired with
;    v2, v1 paired with v3). The v0/v2 pair yields two v_max_f32 (shifted-left
;    and 0xffff0000-masked values), the v1/v3 pair yields one (shifted-left
;    values only). Each result goes through the v_bfe_u32 / add of 0x7fff /
;    v_or_b32 with 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably
;    the f32 to bf16 rounding with NaN handling - TODO confirm).
;  - Repacking - v0 is rebuilt with v_alignbit_b32 on GFX8 and with v_perm_b32
;    (selector 0x7060302) on GFX9 and later. v1 is produced by v_lshrrev_b32
;    on GFX8 and by v_alignbit_b32 by 16 elsewhere.
;  - The first source of that final v_alignbit_b32 on v1 differs per target:
;    s4 for GFX9 and GFX10 (on GFX10 nothing in this function writes s4 first),
;    v0 for GFX11TRUE16 and s0 for GFX11FAKE16. That operand is the only
;    difference between the two GFX11 expectation blocks. Presumably it only
;    feeds padding bits for the absent fourth element - TODO confirm.
21301 define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
21302 ; GCN-LABEL: v_maxnum_v3bf16:
21304 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21305 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21306 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21307 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21308 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21309 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21310 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21311 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21312 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21313 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21314 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21315 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21316 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21317 ; GCN-NEXT: v_max_f32_e32 v2, v2, v5
21318 ; GCN-NEXT: v_max_f32_e32 v1, v1, v4
21319 ; GCN-NEXT: v_max_f32_e32 v0, v0, v3
21320 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21321 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21322 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21323 ; GCN-NEXT: s_setpc_b64 s[30:31]
21325 ; GFX7-LABEL: v_maxnum_v3bf16:
21327 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21328 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21329 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21330 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21331 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21332 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21333 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21334 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21335 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21336 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21337 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21338 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21339 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21340 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
21341 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
21342 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
21343 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21344 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21345 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21346 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21348 ; GFX8-LABEL: v_maxnum_v3bf16:
21350 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21351 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21352 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21353 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
21354 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
21355 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
21356 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
21357 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
21358 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21359 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
21360 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21361 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
21362 ; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
21363 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
21364 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21365 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
21366 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21367 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21368 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
21369 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
21370 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
21371 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21372 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
21373 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
21374 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
21375 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
21376 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
21377 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21378 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
21379 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21380 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21381 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
21382 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21384 ; GFX9-LABEL: v_maxnum_v3bf16:
21386 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21387 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21388 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21389 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
21390 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
21391 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21392 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
21393 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
21394 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21395 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
21396 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21397 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
21398 ; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
21399 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21400 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21401 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
21402 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
21403 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
21404 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
21405 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21406 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
21407 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
21408 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
21409 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
21410 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21411 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
21412 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21413 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
21414 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
21415 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21417 ; GFX10-LABEL: v_maxnum_v3bf16:
21419 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21420 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21421 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21422 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21423 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21424 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21425 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
21426 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
21427 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
21428 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
21429 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
21430 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
21431 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
21432 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21433 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
21434 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21435 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
21436 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21437 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
21438 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21439 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21440 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21441 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21442 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21443 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21444 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21445 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
21446 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21448 ; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
21449 ; GFX11TRUE16: ; %bb.0:
21450 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21451 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21452 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21453 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21454 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21455 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21456 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21457 ; GFX11TRUE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21458 ; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21459 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21460 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
21461 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
21462 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21463 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
21464 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
21465 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21466 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
21467 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
21468 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21469 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21470 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21471 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21472 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21473 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21474 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21475 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21476 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21477 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
21478 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
21479 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
21481 ; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
21482 ; GFX11FAKE16: ; %bb.0:
21483 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21484 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
21485 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21486 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
21487 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21488 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21489 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
21490 ; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
21491 ; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
21492 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
21493 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
21494 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
21495 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21496 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
21497 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
21498 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
21499 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
21500 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
21501 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
21502 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
21503 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
21504 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21505 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21506 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
21507 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21508 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
21509 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
21510 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
21511 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
21512 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
21513 %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
21514 ret <3 x bfloat> %op
21517 define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
21518 ; GCN-LABEL: v_maxnum_v4bf16:
21520 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21521 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21522 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21523 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21524 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21525 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21526 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
21527 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21528 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
21529 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21530 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21531 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21532 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21533 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21534 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21535 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21536 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21537 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7
21538 ; GCN-NEXT: v_max_f32_e32 v2, v2, v6
21539 ; GCN-NEXT: v_max_f32_e32 v1, v1, v5
21540 ; GCN-NEXT: v_max_f32_e32 v0, v0, v4
21541 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21542 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21543 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21544 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21545 ; GCN-NEXT: s_setpc_b64 s[30:31]
21547 ; GFX7-LABEL: v_maxnum_v4bf16:
21549 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21550 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21551 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21552 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21553 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21554 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21555 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
21556 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21557 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
21558 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21559 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21560 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21561 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21562 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21563 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21564 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21565 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21566 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
21567 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
21568 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
21569 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
21570 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21571 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21572 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21573 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21574 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21576 ; GFX8-LABEL: v_maxnum_v4bf16:
21578 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21579 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21580 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21581 ; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
21582 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
21583 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
21584 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21585 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21586 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
21587 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
21588 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
21589 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
21590 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
21591 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21592 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
21593 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
21594 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
21595 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
21596 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21597 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
21598 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21599 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21600 ; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
21601 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
21602 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
21603 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21604 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21605 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
21606 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
21607 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
21608 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21609 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
21610 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
21611 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
21612 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
21613 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
21614 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21615 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
21616 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21617 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21618 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
21619 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
21620 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21622 ; GFX9-LABEL: v_maxnum_v4bf16:
21624 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21625 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21626 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21627 ; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
21628 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21629 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21630 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
21631 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21632 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
21633 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
21634 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
21635 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
21636 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
21637 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
21638 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
21639 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
21640 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21641 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
21642 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
21643 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
21644 ; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
21645 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21646 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21647 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
21648 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
21649 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
21650 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
21651 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21652 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
21653 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
21654 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
21655 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
21656 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21657 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
21658 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
21659 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
21660 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
21661 ; GFX9-NEXT: s_setpc_b64 s[30:31]
21663 ; GFX10-LABEL: v_maxnum_v4bf16:
21665 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21666 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21667 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21668 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21669 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21670 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
21671 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
21672 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
21673 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21674 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21675 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
21676 ; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
21677 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
21678 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
21679 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
21680 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21681 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
21682 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
21683 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
21684 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
21685 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
21686 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
21687 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21688 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
21689 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21690 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
21691 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
21692 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
21693 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21694 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21695 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21696 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21697 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
21698 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21699 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
21700 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21702 ; GFX11-LABEL: v_maxnum_v4bf16:
21704 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21705 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
21706 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
21707 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21708 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21709 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
21710 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
21711 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21712 ; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
21713 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21714 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
21715 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21716 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
21717 ; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
21718 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
21719 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
21720 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
21721 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
21722 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
21723 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
21724 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
21725 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
21726 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
21727 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
21728 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
21729 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
21730 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
21731 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
21732 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
21733 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
21734 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
21735 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
21736 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
21737 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
21738 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
21739 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
21740 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
21741 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21742 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
21743 ; GFX11-NEXT: s_setpc_b64 s[30:31]
21744 %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
21745 ret <4 x bfloat> %op
; v_maxnum_v8bf16: element-wise llvm.maxnum on two <8 x bfloat> vectors.
; The CHECK lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
; Each prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11) matches one llc RUN
; configuration. In every expected sequence the max itself is done with
; v_max_f32 on 32-bit values: GCN/GFX7 operate on one register per element
; (v0-v7 vs v8-v15), while GFX8+ split each register into its low half
; (v_lshlrev_b32 by 16) and high half (v_and_b32 with 0xffff0000) and repack
; the results with v_alignbit_b32 / v_perm_b32.
; NOTE(review): the bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask runs on
; GFX8+ look like f32->bf16 round-to-nearest-even with NaN quieting - confirm.
21748 define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
21749 ; GCN-LABEL: v_maxnum_v8bf16:
21751 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21752 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
21753 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
21754 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
21755 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
21756 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
21757 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
21758 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
21759 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
21760 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
21761 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
21762 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
21763 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
21764 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
21765 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
21766 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
21767 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
21768 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
21769 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21770 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
21771 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21772 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
21773 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21774 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
21775 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21776 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
21777 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21778 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
21779 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21780 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
21781 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21782 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
21783 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21784 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15
21785 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14
21786 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13
21787 ; GCN-NEXT: v_max_f32_e32 v4, v4, v12
21788 ; GCN-NEXT: v_max_f32_e32 v3, v3, v11
21789 ; GCN-NEXT: v_max_f32_e32 v2, v2, v10
21790 ; GCN-NEXT: v_max_f32_e32 v1, v1, v9
21791 ; GCN-NEXT: v_max_f32_e32 v0, v0, v8
21792 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21793 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21794 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21795 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21796 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21797 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21798 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21799 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21800 ; GCN-NEXT: s_setpc_b64 s[30:31]
21802 ; GFX7-LABEL: v_maxnum_v8bf16:
21804 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21805 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
21806 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
21807 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
21808 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
21809 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
21810 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
21811 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
21812 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
21813 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
21814 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
21815 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
21816 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
21817 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
21818 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
21819 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
21820 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
21821 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
21822 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21823 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
21824 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21825 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
21826 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21827 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
21828 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21829 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
21830 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21831 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
21832 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21833 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
21834 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21835 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
21836 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21837 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
21838 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
21839 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
21840 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
21841 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
21842 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
21843 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
21844 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
21845 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21846 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21847 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21848 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21849 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21850 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21851 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21852 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21853 ; GFX7-NEXT: s_setpc_b64 s[30:31]
21855 ; GFX8-LABEL: v_maxnum_v8bf16:
21857 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21858 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
21859 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
21860 ; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
21861 ; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
21862 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
21863 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21864 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21865 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
21866 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
21867 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
21868 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
21869 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
21870 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
21871 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
21872 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
21873 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
21874 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
21875 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21876 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
21877 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
21878 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
21879 ; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
21880 ; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
21881 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
21882 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21883 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21884 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21885 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
21886 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
21887 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
21888 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
21889 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
21890 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
21891 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
21892 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
21893 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21894 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
21895 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
21896 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
21897 ; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
21898 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
21899 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
21900 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21901 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21902 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21903 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
21904 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
21905 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
21906 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
21907 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
21908 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
21909 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
21910 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
21911 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21912 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
21913 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
21914 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
21915 ; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
21916 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
21917 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
21918 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21919 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21920 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
21921 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
21922 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
21923 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
21924 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
21925 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
21926 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
21927 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
21928 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
21929 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
21930 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
21931 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
21932 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
21933 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
21934 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
21935 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
21936 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
21937 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
21938 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
21939 ; GFX8-NEXT: s_setpc_b64 s[30:31]
21941 ; GFX9-LABEL: v_maxnum_v8bf16:
21943 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21944 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
21945 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
21946 ; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
21947 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
21948 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
21949 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
21950 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
21951 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
21952 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
21953 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
21954 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
21955 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
21956 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
21957 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
21958 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
21959 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
21960 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
21961 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
21962 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
21963 ; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
21964 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
21965 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
21966 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
21967 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
21968 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
21969 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
21970 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
21971 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
21972 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
21973 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
21974 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
21975 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
21976 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
21977 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
21978 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
21979 ; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
21980 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
21981 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
21982 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
21983 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
21984 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
21985 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
21986 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
21987 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
21988 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
21989 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
21990 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
21991 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
21992 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
21993 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
21994 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
21995 ; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
21996 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
21997 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
21998 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
21999 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
22000 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
22001 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
22002 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22003 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
22004 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
22005 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
22006 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
22007 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22008 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
22009 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
22010 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
22011 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
22012 ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
22013 ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
22014 ; GFX9-NEXT: s_setpc_b64 s[30:31]
22016 ; GFX10-LABEL: v_maxnum_v8bf16:
22018 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22019 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
22020 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
22021 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22022 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22023 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
22024 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22025 ; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
22026 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
22027 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22028 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
22029 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
22030 ; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
22031 ; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
22032 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
22033 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
22034 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
22035 ; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
22036 ; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
22037 ; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
22038 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
22039 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
22040 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22041 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
22042 ; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
22043 ; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
22044 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
22045 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22046 ; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
22047 ; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
22048 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22049 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22050 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
22051 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
22052 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
22053 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
22054 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22055 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22056 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22057 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
22058 ; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
22059 ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
22060 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
22061 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22062 ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
22063 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
22064 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
22065 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
22066 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22067 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
22068 ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
22069 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
22070 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
22071 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
22072 ; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
22073 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22074 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
22075 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
22076 ; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
22077 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22078 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22079 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22080 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22081 ; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
22082 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22083 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22084 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
22085 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22086 ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
22087 ; GFX10-NEXT: s_setpc_b64 s[30:31]
22089 ; GFX11-LABEL: v_maxnum_v8bf16:
22091 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22092 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
22093 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
22094 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22095 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
22096 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
22097 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22098 ; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
22099 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
22100 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
22101 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22102 ; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
22103 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22104 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
22105 ; GFX11-NEXT: v_max_f32_e32 v7, v10, v9
22106 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
22107 ; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
22108 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22109 ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
22110 ; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
22111 ; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
22112 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
22113 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
22114 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22115 ; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
22116 ; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
22117 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
22118 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22119 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
22120 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22121 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22122 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
22123 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
22124 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22125 ; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
22126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22127 ; GFX11-NEXT: v_max_f32_e32 v6, v10, v6
22128 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
22129 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22130 ; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
22131 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22132 ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
22133 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
22134 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
22135 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22136 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
22137 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22138 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22139 ; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
22140 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22141 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
22142 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
22143 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
22144 ; GFX11-NEXT: v_max_f32_e32 v5, v15, v13
22145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22146 ; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
22147 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
22148 ; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
22149 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
22150 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22151 ; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
22152 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
22153 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22154 ; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
22155 ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
22156 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
22157 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
22158 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
22159 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22160 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
22161 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22162 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22163 ; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
22164 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
22165 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22166 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
22167 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
22168 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22169 ; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
22170 ; GFX11-NEXT: s_setpc_b64 s[30:31]
22171 %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
22172 ret <8 x bfloat> %op
22175 define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
22176 ; GCN-LABEL: v_maxnum_v16bf16:
22178 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22179 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
22180 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
22181 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
22182 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22183 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30
22184 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
22185 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
22186 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
22187 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22188 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29
22189 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
22190 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
22191 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
22192 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22193 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28
22194 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
22195 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
22196 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
22197 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22198 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27
22199 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
22200 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
22201 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
22202 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22203 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26
22204 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
22205 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
22206 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
22207 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22208 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25
22209 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
22210 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
22211 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
22212 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22213 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24
22214 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
22215 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
22216 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
22217 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22218 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23
22219 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
22220 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
22221 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22222 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22223 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22
22224 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
22225 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
22226 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
22227 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22228 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21
22229 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
22230 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
22231 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
22232 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
22233 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
22234 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
22235 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
22236 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
22237 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
22238 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
22239 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
22240 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
22241 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22242 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20
22243 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
22244 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22245 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
22246 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22247 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
22248 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22249 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
22250 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22251 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22252 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22253 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19
22254 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18
22255 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17
22256 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16
22257 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22258 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22259 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22260 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22261 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22262 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22263 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22264 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22265 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22266 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22267 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22268 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22269 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22270 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22271 ; GCN-NEXT: s_waitcnt vmcnt(0)
22272 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
22273 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22274 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16
22275 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22276 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22277 ; GCN-NEXT: s_setpc_b64 s[30:31]
22279 ; GFX7-LABEL: v_maxnum_v16bf16:
22281 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22282 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
22283 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
22284 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22285 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22286 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
22287 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
22288 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
22289 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
22290 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
22291 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
22292 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
22293 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
22294 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
22295 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
22296 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
22297 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
22298 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
22299 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
22300 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
22301 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
22302 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
22303 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
22304 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
22305 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
22306 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
22307 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
22308 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
22309 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
22310 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
22311 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
22312 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
22313 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
22314 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
22315 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
22316 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
22317 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
22318 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22319 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
22320 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22321 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
22322 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22323 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
22324 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22325 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
22326 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22327 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
22328 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22329 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
22330 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22331 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
22332 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22333 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22334 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
22335 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22336 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
22337 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22338 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
22339 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22340 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
22341 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22342 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
22343 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22344 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
22345 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22346 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
22347 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
22348 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
22349 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
22350 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
22351 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
22352 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
22353 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
22354 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
22355 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
22356 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
22357 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
22358 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
22359 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
22360 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22361 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22362 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22363 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22364 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22365 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22366 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22367 ; GFX7-NEXT: s_waitcnt vmcnt(0)
22368 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
22369 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
22370 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
22371 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22372 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22373 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22374 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22375 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22376 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22377 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22378 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22379 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22380 ; GFX7-NEXT: s_setpc_b64 s[30:31]
22382 ; GFX8-LABEL: v_maxnum_v16bf16:
22384 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22385 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22386 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22387 ; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
22388 ; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
22389 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
22390 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
22391 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22392 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22393 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22394 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
22395 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
22396 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
22397 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
22398 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
22399 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
22400 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
22401 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
22402 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
22403 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
22404 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
22405 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
22406 ; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
22407 ; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
22408 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
22409 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22410 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22411 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22412 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
22413 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
22414 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
22415 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
22416 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
22417 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
22418 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
22419 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
22420 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
22421 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
22422 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
22423 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
22424 ; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
22425 ; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
22426 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
22427 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22428 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22429 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22430 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
22431 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
22432 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
22433 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
22434 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
22435 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
22436 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
22437 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
22438 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22439 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
22440 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
22441 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
22442 ; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
22443 ; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
22444 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
22445 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22446 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22447 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22448 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
22449 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
22450 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
22451 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
22452 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
22453 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
22454 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
22455 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
22456 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
22457 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
22458 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22459 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
22460 ; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
22461 ; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
22462 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
22463 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22464 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22465 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22466 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
22467 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
22468 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
22469 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
22470 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
22471 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
22472 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
22473 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
22474 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
22475 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
22476 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
22477 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
22478 ; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
22479 ; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
22480 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
22481 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22482 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22483 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22484 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
22485 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
22486 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
22487 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
22488 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
22489 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
22490 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
22491 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
22492 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
22493 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
22494 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
22495 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
22496 ; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
22497 ; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
22498 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
22499 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22500 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22501 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22502 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
22503 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
22504 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
22505 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
22506 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
22507 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
22508 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
22509 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
22510 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
22511 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
22512 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
22513 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
22514 ; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
22515 ; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
22516 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
22517 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22518 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22519 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
22520 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
22521 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
22522 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
22523 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
22524 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
22525 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
22526 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
22527 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
22528 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22529 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
22530 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
22531 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
22532 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
22533 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
22534 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
22535 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
22536 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
22537 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
22538 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
22539 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
22540 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
22541 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
22542 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
22543 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
22544 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
22545 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
22546 ; GFX8-NEXT: s_setpc_b64 s[30:31]
22548 ; GFX9-LABEL: v_maxnum_v16bf16:
22550 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22551 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22552 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22553 ; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
22554 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22555 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22556 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
22557 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
22558 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
22559 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
22560 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
22561 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
22562 ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
22563 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
22564 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
22565 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
22566 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
22567 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
22568 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
22569 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
22570 ; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
22571 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22572 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22573 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
22574 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
22575 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
22576 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
22577 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
22578 ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
22579 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
22580 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
22581 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
22582 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
22583 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
22584 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
22585 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
22586 ; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
22587 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22588 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22589 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
22590 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
22591 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
22592 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
22593 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
22594 ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
22595 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
22596 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
22597 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
22598 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
22599 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
22600 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
22601 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
22602 ; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
22603 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22604 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22605 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
22606 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
22607 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
22608 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
22609 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
22610 ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
22611 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
22612 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
22613 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
22614 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
22615 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
22616 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22617 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
22618 ; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
22619 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22620 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22621 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
22622 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
22623 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
22624 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
22625 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
22626 ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
22627 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
22628 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
22629 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
22630 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
22631 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
22632 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
22633 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
22634 ; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
22635 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22636 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22637 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
22638 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
22639 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
22640 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
22641 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
22642 ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
22643 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
22644 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
22645 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
22646 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
22647 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
22648 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
22649 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
22650 ; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
22651 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22652 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22653 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
22654 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
22655 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
22656 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
22657 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
22658 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
22659 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
22660 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
22661 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
22662 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
22663 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
22664 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
22665 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
22666 ; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
22667 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22668 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22669 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
22670 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
22671 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
22672 ; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
22673 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
22674 ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
22675 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
22676 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
22677 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
22678 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
22679 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
22680 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
22681 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
22682 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
22683 ; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
22684 ; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
22685 ; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
22686 ; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
22687 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
22688 ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
22689 ; GFX9-NEXT: s_setpc_b64 s[30:31]
22691 ; GFX10-LABEL: v_maxnum_v16bf16:
22693 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22694 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22695 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22696 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
22697 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22698 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
22699 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22700 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
22701 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
22702 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
22703 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22704 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
22705 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
22706 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
22707 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
22708 ; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
22709 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
22710 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
22711 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
22712 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
22713 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
22714 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
22715 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22716 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
22717 ; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
22718 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
22719 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22720 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22721 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
22722 ; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
22723 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22724 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22725 ; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
22726 ; GFX10-NEXT: v_max_f32_e32 v17, v20, v19
22727 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
22728 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
22729 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
22730 ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
22731 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
22732 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
22733 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
22734 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22735 ; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
22736 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22737 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22738 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
22739 ; GFX10-NEXT: v_max_f32_e32 v13, v19, v18
22740 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
22741 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
22742 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22743 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
22744 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
22745 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
22746 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
22747 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22748 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22749 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22750 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
22751 ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
22752 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22753 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22754 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
22755 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22756 ; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
22757 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
22758 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
22759 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
22760 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
22761 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
22762 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22763 ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
22764 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
22765 ; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
22766 ; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
22767 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22768 ; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
22769 ; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
22770 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22771 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
22772 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
22773 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
22774 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
22775 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
22776 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22777 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22778 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
22779 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
22780 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
22781 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
22782 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22783 ; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
22784 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
22785 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
22786 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22787 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
22788 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
22789 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
22790 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22791 ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
22792 ; GFX10-NEXT: v_max_f32_e32 v19, v22, v20
22793 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
22794 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
22795 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22796 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22797 ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
22798 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
22799 ; GFX10-NEXT: v_max_f32_e32 v9, v22, v20
22800 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
22801 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
22802 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
22803 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
22804 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
22805 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
22806 ; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
22807 ; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
22808 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
22809 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22810 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
22811 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22812 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
22813 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
22814 ; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
22815 ; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
22816 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22817 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
22818 ; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
22819 ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
22820 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22821 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22822 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22823 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22824 ; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
22825 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22826 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
22827 ; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
22828 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22829 ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
22830 ; GFX10-NEXT: s_setpc_b64 s[30:31]
22832 ; GFX11-LABEL: v_maxnum_v16bf16:
22834 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22835 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
22836 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
22837 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
22838 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
22839 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
22840 ; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
22841 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
22842 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
22843 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
22844 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
22845 ; GFX11-NEXT: v_max_f32_e32 v17, v18, v17
22846 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v14
22847 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
22848 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
22849 ; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
22850 ; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
22851 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
22852 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
22853 ; GFX11-NEXT: v_max_f32_e32 v7, v7, v15
22854 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
22855 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
22856 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
22857 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22858 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
22859 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
22860 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
22861 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
22862 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
22863 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22864 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
22865 ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
22866 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13
22867 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22868 ; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
22869 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
22870 ; GFX11-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
22871 ; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
22872 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
22873 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4
22874 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
22875 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
22876 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
22877 ; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
22878 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
22879 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
22880 ; GFX11-NEXT: v_max_f32_e32 v4, v4, v12
22881 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
22882 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
22883 ; GFX11-NEXT: v_max_f32_e32 v5, v5, v13
22884 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
22885 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
22886 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
22887 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
22888 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
22889 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
22890 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
22891 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
22892 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
22893 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
22894 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
22895 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
22896 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
22897 ; GFX11-NEXT: v_max_f32_e32 v12, v18, v12
22898 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22899 ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
22900 ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
22901 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
22902 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
22903 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22904 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
22905 ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
22906 ; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
22907 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
22908 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
22909 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
22910 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
22911 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
22912 ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
22913 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
22914 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2
22915 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
22916 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
22917 ; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
22918 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
22919 ; GFX11-NEXT: v_max_f32_e32 v18, v19, v18
22920 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
22921 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
22922 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
22923 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
22924 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
22925 ; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
22926 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
22927 ; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
22928 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v11
22929 ; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
22930 ; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
22931 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
22932 ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
22933 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
22934 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
22935 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
22936 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22937 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
22938 ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
22939 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
22940 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
22941 ; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
22942 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
22943 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
22944 ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
22945 ; GFX11-NEXT: v_max_f32_e32 v19, v22, v20
22946 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8
22947 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0
22948 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
22949 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
22950 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
22951 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
22952 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
22953 ; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1
22954 ; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9
22955 ; GFX11-NEXT: v_max_f32_e32 v9, v22, v20
22956 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22957 ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
22958 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
22959 ; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
22960 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
22961 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
22962 ; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
22963 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
22964 ; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
22965 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
22966 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
22967 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
22968 ; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
22969 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
22970 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
22971 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
22972 ; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
22973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
22974 ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
22975 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
22976 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
22977 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
22978 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
22979 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
22980 ; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
22981 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
22982 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
22983 ; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
22984 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
22985 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
22986 ; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
22987 ; GFX11-NEXT: s_setpc_b64 s[30:31]
22988 %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
22989 ret <16 x bfloat> %op
22992 define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
22993 ; GCN-LABEL: v_maxnum_v32bf16:
22995 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22996 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
22997 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
22998 ; GCN-NEXT: s_waitcnt vmcnt(1)
22999 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
23000 ; GCN-NEXT: s_waitcnt vmcnt(0)
23001 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
23002 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23003 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23004 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
23005 ; GCN-NEXT: v_max_f32_e32 v31, v31, v32
23006 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
23007 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23008 ; GCN-NEXT: s_waitcnt vmcnt(0)
23009 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23010 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23011 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
23012 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32
23013 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
23014 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23015 ; GCN-NEXT: s_waitcnt vmcnt(0)
23016 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23017 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23018 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
23019 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32
23020 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
23021 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23022 ; GCN-NEXT: s_waitcnt vmcnt(0)
23023 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23024 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23025 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
23026 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32
23027 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
23028 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23029 ; GCN-NEXT: s_waitcnt vmcnt(0)
23030 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23031 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23032 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
23033 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32
23034 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
23035 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23036 ; GCN-NEXT: s_waitcnt vmcnt(0)
23037 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23038 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23039 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
23040 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32
23041 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
23042 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23043 ; GCN-NEXT: s_waitcnt vmcnt(0)
23044 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23045 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23046 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
23047 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32
23048 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
23049 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23050 ; GCN-NEXT: s_waitcnt vmcnt(0)
23051 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23052 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23053 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
23054 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32
23055 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
23056 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23057 ; GCN-NEXT: s_waitcnt vmcnt(0)
23058 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23059 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23060 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
23061 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32
23062 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
23063 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23064 ; GCN-NEXT: s_waitcnt vmcnt(0)
23065 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23066 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23067 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
23068 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32
23069 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
23070 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23071 ; GCN-NEXT: s_waitcnt vmcnt(0)
23072 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23073 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23074 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
23075 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32
23076 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
23077 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23078 ; GCN-NEXT: s_waitcnt vmcnt(0)
23079 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23080 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23081 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
23082 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32
23083 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
23084 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23085 ; GCN-NEXT: s_waitcnt vmcnt(0)
23086 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23087 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23088 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
23089 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32
23090 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
23091 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23092 ; GCN-NEXT: s_waitcnt vmcnt(0)
23093 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23094 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23095 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
23096 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32
23097 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
23098 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23099 ; GCN-NEXT: s_waitcnt vmcnt(0)
23100 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23101 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23102 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
23103 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32
23104 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
23105 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23106 ; GCN-NEXT: s_waitcnt vmcnt(0)
23107 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23108 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23109 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
23110 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32
23111 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
23112 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23113 ; GCN-NEXT: s_waitcnt vmcnt(0)
23114 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23115 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23116 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
23117 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32
23118 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
23119 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23120 ; GCN-NEXT: s_waitcnt vmcnt(0)
23121 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23122 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23123 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
23124 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32
23125 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
23126 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23127 ; GCN-NEXT: s_waitcnt vmcnt(0)
23128 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23129 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23130 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
23131 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32
23132 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
23133 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23134 ; GCN-NEXT: s_waitcnt vmcnt(0)
23135 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23136 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23137 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
23138 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32
23139 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
23140 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23141 ; GCN-NEXT: s_waitcnt vmcnt(0)
23142 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23143 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23144 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
23145 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32
23146 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
23147 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23148 ; GCN-NEXT: s_waitcnt vmcnt(0)
23149 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23150 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23151 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
23152 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32
23153 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
23154 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23155 ; GCN-NEXT: s_waitcnt vmcnt(0)
23156 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23157 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23158 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
23159 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32
23160 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
23161 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23162 ; GCN-NEXT: s_waitcnt vmcnt(0)
23163 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23164 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23165 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
23166 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32
23167 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
23168 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23169 ; GCN-NEXT: s_waitcnt vmcnt(0)
23170 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23171 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23172 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
23173 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32
23174 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
23175 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23176 ; GCN-NEXT: s_waitcnt vmcnt(0)
23177 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23178 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23179 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
23180 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32
23181 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
23182 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23183 ; GCN-NEXT: s_waitcnt vmcnt(0)
23184 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23185 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23186 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
23187 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32
23188 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
23189 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23190 ; GCN-NEXT: s_waitcnt vmcnt(0)
23191 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23192 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23193 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
23194 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32
23195 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
23196 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23197 ; GCN-NEXT: s_waitcnt vmcnt(0)
23198 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23199 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23200 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
23201 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32
23202 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
23203 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23204 ; GCN-NEXT: s_waitcnt vmcnt(0)
23205 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23206 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23207 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
23208 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32
23209 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
23210 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23211 ; GCN-NEXT: s_waitcnt vmcnt(0)
23212 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23213 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23214 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
23215 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32
23216 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
23217 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23218 ; GCN-NEXT: s_waitcnt vmcnt(0)
23219 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
23220 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23221 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32
23222 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23223 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23224 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23225 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23226 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23227 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23228 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23229 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23230 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23231 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23232 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23233 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23234 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23235 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23236 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23237 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23238 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23239 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23240 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23241 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23242 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23243 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23244 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23245 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23246 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23247 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23248 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23249 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23250 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23251 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23252 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23253 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23254 ; GCN-NEXT: s_setpc_b64 s[30:31]
23256 ; GFX7-LABEL: v_maxnum_v32bf16:
23258 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23259 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
23260 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
23261 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
23262 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23263 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
23264 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23265 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
23266 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23267 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
23268 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23269 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
23270 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23271 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
23272 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23273 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
23274 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23275 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
23276 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23277 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
23278 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23279 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
23280 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23281 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
23282 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23283 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
23284 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23285 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
23286 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23287 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
23288 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23289 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
23290 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23291 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
23292 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23293 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
23294 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23295 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
23296 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23297 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
23298 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23299 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
23300 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23301 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
23302 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23303 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
23304 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23305 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
23306 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23307 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
23308 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23309 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
23310 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23311 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
23312 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23313 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
23314 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23315 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
23316 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23317 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
23318 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23319 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
23320 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23321 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
23322 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23323 ; GFX7-NEXT: s_waitcnt vmcnt(1)
23324 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
23325 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23326 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23327 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23328 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23329 ; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
23330 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
23331 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
23332 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23333 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23334 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23335 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
23336 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
23337 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23338 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23339 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23340 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23341 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
23342 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
23343 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23344 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23345 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23346 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23347 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
23348 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
23349 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23350 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23351 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23352 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23353 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
23354 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
23355 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23356 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23357 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23358 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23359 ; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
23360 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
23361 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23362 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23363 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23364 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23365 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
23366 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
23367 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23368 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23369 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23370 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23371 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
23372 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
23373 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23374 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23375 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23376 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23377 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
23378 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
23379 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23380 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23381 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23382 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23383 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
23384 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
23385 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23386 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23387 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23388 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23389 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
23390 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
23391 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23392 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23393 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23394 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23395 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
23396 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
23397 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23398 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23399 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23400 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23401 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
23402 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
23403 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23404 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23405 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23406 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23407 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
23408 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
23409 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23410 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23411 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23412 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23413 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
23414 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
23415 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23416 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23417 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23418 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23419 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
23420 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
23421 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23422 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23423 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23424 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23425 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
23426 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
23427 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23428 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23429 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23430 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23431 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
23432 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
23433 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23434 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23435 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23436 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23437 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
23438 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
23439 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23440 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23441 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23442 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23443 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
23444 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
23445 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23446 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23447 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23448 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23449 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
23450 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
23451 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23452 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23453 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23454 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23455 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
23456 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
23457 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23458 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23459 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23460 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23461 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
23462 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
23463 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23464 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23465 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23466 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23467 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
23468 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
23469 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23470 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23471 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23472 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23473 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
23474 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
23475 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23476 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23477 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23478 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23479 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
23480 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
23481 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23482 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23483 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23484 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23485 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
23486 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
23487 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23488 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23489 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23490 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23491 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
23492 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
23493 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23494 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23495 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23496 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23497 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
23498 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
23499 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23500 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23501 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23502 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23503 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
23504 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
23505 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23506 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23507 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23508 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23509 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
23510 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
23511 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23512 ; GFX7-NEXT: s_waitcnt vmcnt(0)
23513 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
23514 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
23515 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
23516 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23517 ; GFX7-NEXT: s_setpc_b64 s[30:31]
23519 ; GFX8-LABEL: v_maxnum_v32bf16:
23521 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23522 ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
23523 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
23524 ; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
23525 ; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
23526 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
23527 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
23528 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23529 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23530 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
23531 ; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
23532 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
23533 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
23534 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
23535 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
23536 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
23537 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
23538 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
23539 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
23540 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
23541 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
23542 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
23543 ; GFX8-NEXT: v_max_f32_e32 v32, v32, v30
23544 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
23545 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
23546 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23547 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23548 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23549 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
23550 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
23551 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
23552 ; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
23553 ; GFX8-NEXT: s_waitcnt vmcnt(0)
23554 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
23555 ; GFX8-NEXT: v_max_f32_e32 v33, v33, v34
23556 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23557 ; GFX8-NEXT: v_max_f32_e32 v30, v15, v30
23558 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
23559 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
23560 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
23561 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
23562 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
23563 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
23564 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
23565 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
23566 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23567 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
23568 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
23569 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
23570 ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
23571 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
23572 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23573 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
23574 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
23575 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
23576 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
23577 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
23578 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
23579 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
23580 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
23581 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
23582 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
23583 ; GFX8-NEXT: v_max_f32_e32 v29, v33, v29
23584 ; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
23585 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
23586 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23587 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23588 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23589 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
23590 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
23591 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
23592 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
23593 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
23594 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
23595 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
23596 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
23597 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
23598 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
23599 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
23600 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
23601 ; GFX8-NEXT: v_max_f32_e32 v28, v33, v28
23602 ; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
23603 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
23604 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23605 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23606 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23607 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
23608 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
23609 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
23610 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
23611 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
23612 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
23613 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
23614 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
23615 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
23616 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
23617 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
23618 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
23619 ; GFX8-NEXT: v_max_f32_e32 v27, v33, v27
23620 ; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
23621 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
23622 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23623 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23624 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23625 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
23626 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
23627 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
23628 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
23629 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
23630 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
23631 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
23632 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
23633 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
23634 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
23635 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
23636 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
23637 ; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
23638 ; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
23639 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
23640 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23641 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23642 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23643 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
23644 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
23645 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
23646 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
23647 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
23648 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
23649 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
23650 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
23651 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
23652 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
23653 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
23654 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
23655 ; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
23656 ; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
23657 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
23658 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23659 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23660 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23661 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
23662 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
23663 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
23664 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
23665 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
23666 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
23667 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
23668 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
23669 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
23670 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
23671 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
23672 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
23673 ; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
23674 ; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
23675 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
23676 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23677 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23678 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23679 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
23680 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
23681 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
23682 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
23683 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
23684 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
23685 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
23686 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
23687 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
23688 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
23689 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
23690 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
23691 ; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
23692 ; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
23693 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
23694 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
23695 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
23696 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23697 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
23698 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
23699 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
23700 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
23701 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
23702 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
23703 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
23704 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
23705 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
23706 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
23707 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
23708 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
23709 ; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
23710 ; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
23711 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
23712 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
23713 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
23714 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23715 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
23716 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
23717 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
23718 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
23719 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
23720 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
23721 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
23722 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
23723 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
23724 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
23725 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
23726 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
23727 ; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
23728 ; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
23729 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
23730 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
23731 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
23732 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23733 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
23734 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
23735 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
23736 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
23737 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
23738 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
23739 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
23740 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
23741 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
23742 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
23743 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
23744 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
23745 ; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
23746 ; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
23747 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
23748 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
23749 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
23750 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23751 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
23752 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
23753 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
23754 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
23755 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
23756 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
23757 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
23758 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
23759 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
23760 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
23761 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
23762 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
23763 ; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
23764 ; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
23765 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
23766 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
23767 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
23768 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23769 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
23770 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
23771 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
23772 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
23773 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
23774 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
23775 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
23776 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
23777 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
23778 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
23779 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
23780 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
23781 ; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
23782 ; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
23783 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
23784 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
23785 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
23786 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23787 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
23788 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
23789 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
23790 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
23791 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
23792 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
23793 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
23794 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
23795 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
23796 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
23797 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
23798 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
23799 ; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
23800 ; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
23801 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
23802 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
23803 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
23804 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
23805 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
23806 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
23807 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
23808 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
23809 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
23810 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
23811 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
23812 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
23813 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
23814 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
23815 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
23816 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
23817 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
23818 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
23819 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
23820 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
23821 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
23822 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
23823 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
23824 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
23825 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
23826 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
23827 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
23828 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
23829 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
23830 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
23831 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
23832 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
23833 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
23834 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
23835 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
23836 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
23837 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
23838 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
23839 ; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
23840 ; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
23841 ; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
23842 ; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
23843 ; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
23844 ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
23845 ; GFX8-NEXT: s_setpc_b64 s[30:31]
23847 ; GFX9-LABEL: v_maxnum_v32bf16:
23849 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23850 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
23851 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
23852 ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
23853 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
23854 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
23855 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
23856 ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
23857 ; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
23858 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
23859 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
23860 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
23861 ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
23862 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
23863 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
23864 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
23865 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
23866 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
23867 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
23868 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
23869 ; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
23870 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23871 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
23872 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
23873 ; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
23874 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
23875 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
23876 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
23877 ; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
23878 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
23879 ; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
23880 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
23881 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
23882 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
23883 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
23884 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
23885 ; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
23886 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
23887 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
23888 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
23889 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
23890 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
23891 ; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
23892 ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
23893 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
23894 ; GFX9-NEXT: s_waitcnt vmcnt(0)
23895 ; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
23896 ; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
23897 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
23898 ; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
23899 ; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
23900 ; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
23901 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
23902 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
23903 ; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
23904 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
23905 ; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
23906 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
23907 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
23908 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
23909 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
23910 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
23911 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
23912 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
23913 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
23914 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
23915 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
23916 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
23917 ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
23918 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
23919 ; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
23920 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
23921 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
23922 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
23923 ; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
23924 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
23925 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
23926 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
23927 ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
23928 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
23929 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
23930 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
23931 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
23932 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
23933 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
23934 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
23935 ; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
23936 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
23937 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
23938 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
23939 ; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
23940 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
23941 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
23942 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
23943 ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
23944 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
23945 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
23946 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
23947 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
23948 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
23949 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
23950 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
23951 ; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
23952 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
23953 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
23954 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
23955 ; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
23956 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
23957 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
23958 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
23959 ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
23960 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
23961 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
23962 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
23963 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
23964 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
23965 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
23966 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
23967 ; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
23968 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
23969 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
23970 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
23971 ; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
23972 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
23973 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
23974 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
23975 ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
23976 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
23977 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
23978 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
23979 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
23980 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
23981 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
23982 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
23983 ; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
23984 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
23985 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
23986 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
23987 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
23988 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
23989 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
23990 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
23991 ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
23992 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
23993 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
23994 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
23995 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
23996 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
23997 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
23998 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
23999 ; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
24000 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
24001 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24002 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
24003 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
24004 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
24005 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
24006 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
24007 ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
24008 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
24009 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
24010 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
24011 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
24012 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
24013 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
24014 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
24015 ; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
24016 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24017 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24018 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
24019 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
24020 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
24021 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
24022 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
24023 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
24024 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
24025 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
24026 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
24027 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
24028 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
24029 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
24030 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
24031 ; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
24032 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24033 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24034 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
24035 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
24036 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
24037 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
24038 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
24039 ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
24040 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
24041 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
24042 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
24043 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
24044 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
24045 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
24046 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
24047 ; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
24048 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24049 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24050 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
24051 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
24052 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
24053 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
24054 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
24055 ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
24056 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
24057 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
24058 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
24059 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
24060 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
24061 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
24062 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
24063 ; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
24064 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24065 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
24066 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
24067 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
24068 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
24069 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
24070 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
24071 ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
24072 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
24073 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
24074 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
24075 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
24076 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
24077 ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
24078 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
24079 ; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
24080 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24081 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24082 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
24083 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
24084 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
24085 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
24086 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
24087 ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
24088 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
24089 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
24090 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
24091 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
24092 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
24093 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
24094 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
24095 ; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
24096 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24097 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24098 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
24099 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
24100 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
24101 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
24102 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
24103 ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
24104 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
24105 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
24106 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
24107 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24108 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
24109 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
24110 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
24111 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
24112 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
24113 ; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
24114 ; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
24115 ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
24116 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
24117 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
24118 ; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
24119 ; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
24120 ; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
24121 ; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
24122 ; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
24123 ; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
24124 ; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
24125 ; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
24126 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24128 ; GFX10-LABEL: v_maxnum_v32bf16:
24130 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24131 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
24132 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
24133 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
24134 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
24135 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
24136 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
24137 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
24138 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
24139 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
24140 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
24141 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
24142 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
24143 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
24144 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
24145 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
24146 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
24147 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
24148 ; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
24149 ; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
24150 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
24151 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
24152 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
24153 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
24154 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
24155 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
24156 ; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
24157 ; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
24158 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
24159 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24160 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
24161 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
24162 ; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
24163 ; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
24164 ; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
24165 ; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
24166 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
24167 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
24168 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
24169 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
24170 ; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
24171 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
24172 ; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
24173 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
24174 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24175 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
24176 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
24177 ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
24178 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24179 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24180 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
24181 ; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
24182 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24183 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24184 ; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
24185 ; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
24186 ; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
24187 ; GFX10-NEXT: v_max_f32_e32 v24, v64, v55
24188 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
24189 ; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
24190 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
24191 ; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
24192 ; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
24193 ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
24194 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
24195 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
24196 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
24197 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
24198 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
24199 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
24200 ; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
24201 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
24202 ; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
24203 ; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
24204 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24205 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24206 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
24207 ; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
24208 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
24209 ; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
24210 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
24211 ; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
24212 ; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
24213 ; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
24214 ; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
24215 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
24216 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
24217 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
24218 ; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
24219 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
24220 ; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
24221 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
24222 ; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
24223 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24224 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24225 ; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
24226 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
24227 ; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
24228 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
24229 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24230 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24231 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
24232 ; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
24233 ; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
24234 ; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
24235 ; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
24236 ; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
24237 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
24238 ; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
24239 ; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
24240 ; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
24241 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
24242 ; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
24243 ; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
24244 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
24245 ; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
24246 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
24247 ; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
24248 ; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
24249 ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
24250 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
24251 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
24252 ; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
24253 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
24254 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
24255 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
24256 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
24257 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
24258 ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
24259 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
24260 ; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
24261 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
24262 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
24263 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
24264 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
24265 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
24266 ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
24267 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
24268 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
24269 ; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
24270 ; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
24271 ; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
24272 ; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
24273 ; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
24274 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
24275 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
24276 ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
24277 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
24278 ; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
24279 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
24280 ; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
24281 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
24282 ; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
24283 ; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
24284 ; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
24285 ; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
24286 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
24287 ; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
24288 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
24289 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
24290 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
24291 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
24292 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
24293 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
24294 ; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
24295 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
24296 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
24297 ; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
24298 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
24299 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
24300 ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
24301 ; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
24302 ; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
24303 ; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
24304 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
24305 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
24306 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
24307 ; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
24308 ; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
24309 ; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
24310 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
24311 ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
24312 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
24313 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
24314 ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
24315 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
24316 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
24317 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
24318 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
24319 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
24320 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
24321 ; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
24322 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
24323 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
24324 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
24325 ; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
24326 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
24327 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
24328 ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
24329 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
24330 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
24331 ; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
24332 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
24333 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
24334 ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
24335 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
24336 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
24337 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
24338 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
24339 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
24340 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
24341 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
24342 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
24343 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
24344 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
24345 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
24346 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
24347 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
24348 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
24349 ; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
24350 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
24351 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
24352 ; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
24353 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
24354 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
24355 ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
24356 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
24357 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
24358 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
24359 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
24360 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
24361 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
24362 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
24363 ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
24364 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
24365 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
24366 ; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
24367 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
24368 ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
24369 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
24370 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
24371 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
24372 ; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
24373 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
24374 ; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
24375 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
24376 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
24377 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
24378 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
24379 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
24380 ; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
24381 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
24382 ; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
24383 ; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
24384 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
24385 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
24386 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
24387 ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
24388 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
24389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
24390 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
24391 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
24392 ; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
24393 ; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
24394 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
24395 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
24396 ; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
24397 ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
24398 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24399 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
24400 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
24401 ; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
24402 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
24403 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
24404 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
24405 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24407 ; GFX11-LABEL: v_maxnum_v32bf16:
24409 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24410 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
24411 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
24412 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
24413 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
24414 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
24415 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
24416 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
24417 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
24418 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
24419 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
24420 ; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
24421 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
24422 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24423 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
24424 ; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
24425 ; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
24426 ; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
24427 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
24428 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
24429 ; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
24430 ; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
24431 ; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
24432 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
24433 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
24434 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
24435 ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
24436 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
24437 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24438 ; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
24439 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
24440 ; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
24441 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
24442 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
24443 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
24444 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24445 ; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
24446 ; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
24447 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
24448 ; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
24449 ; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
24450 ; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
24451 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
24452 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
24453 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24454 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
24455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24456 ; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
24457 ; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
24458 ; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
24459 ; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
24460 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24461 ; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
24462 ; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
24463 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
24464 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
24465 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
24466 ; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
24467 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
24468 ; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
24469 ; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
24470 ; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
24471 ; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
24472 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v16
24473 ; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
24474 ; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
24475 ; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
24476 ; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
24477 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
24478 ; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
24479 ; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
24480 ; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
24481 ; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
24482 ; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
24483 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
24484 ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
24485 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
24486 ; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
24487 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
24488 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
24489 ; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
24490 ; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
24491 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
24492 ; GFX11-NEXT: v_max_f32_e32 v4, v4, v20
24493 ; GFX11-NEXT: v_max_f32_e32 v20, v80, v71
24494 ; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
24495 ; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
24496 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
24497 ; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
24498 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24499 ; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
24500 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
24501 ; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
24502 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
24503 ; GFX11-NEXT: v_max_f32_e32 v26, v52, v51
24504 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24505 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
24506 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
24507 ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
24508 ; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
24509 ; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
24510 ; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
24511 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
24512 ; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
24513 ; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
24514 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
24515 ; GFX11-NEXT: v_max_f32_e32 v29, v38, v37
24516 ; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
24517 ; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
24518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
24519 ; GFX11-NEXT: v_max_f32_e32 v14, v14, v30
24520 ; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
24521 ; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
24522 ; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
24523 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
24524 ; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
24525 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
24526 ; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
24527 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
24528 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
24529 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
24530 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
24531 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
24532 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
24533 ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
24534 ; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
24535 ; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
24536 ; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
24537 ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
24538 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
24539 ; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
24540 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
24541 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
24542 ; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
24543 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
24544 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
24545 ; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
24546 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
24547 ; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
24548 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
24549 ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
24550 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
24551 ; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
24552 ; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
24553 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
24554 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
24555 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
24556 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
24557 ; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
24558 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
24559 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
24560 ; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
24561 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
24562 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
24563 ; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
24564 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
24565 ; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
24566 ; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
24567 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
24568 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
24569 ; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
24570 ; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
24571 ; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
24572 ; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
24573 ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
24574 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
24575 ; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
24576 ; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
24577 ; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
24578 ; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
24579 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
24580 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
24581 ; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
24582 ; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
24583 ; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
24584 ; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
24585 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
24586 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
24587 ; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
24588 ; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
24589 ; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
24590 ; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
24591 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
24592 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
24593 ; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
24594 ; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
24595 ; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
24596 ; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
24597 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
24598 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
24599 ; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
24600 ; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
24601 ; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
24602 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
24603 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
24604 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
24605 ; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
24606 ; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
24607 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
24608 ; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
24609 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
24610 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
24611 ; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
24612 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
24613 ; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
24614 ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
24615 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
24616 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
24617 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
24618 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
24619 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
24620 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
24621 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
24622 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
24623 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
24624 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
24625 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
24626 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24627 ; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
24628 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
24629 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
24630 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
24631 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
24632 ; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
24633 ; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
24634 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
24635 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
24636 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
24637 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24638 ; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
24639 ; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
24640 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
24641 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
24642 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
24643 ; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
24644 ; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
24645 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
24646 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
24647 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
24648 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
24649 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
24650 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
24651 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24652 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
24653 ; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
24654 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
24655 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24656 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
24657 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
24658 ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
24659 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
24660 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
24661 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
24662 ; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
24663 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
24664 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
24665 ; GFX11-NEXT: s_waitcnt vmcnt(0)
24666 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
24667 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24668 ; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
24669 ; GFX11-NEXT: v_max_f32_e32 v15, v15, v18
24670 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24671 ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
24672 ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
24673 ; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
24674 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
24675 ; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
24676 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
24677 ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
24678 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24679 ; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
24680 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
24681 ; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
24682 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
24683 ; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
24684 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24685 %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
24686 ret <32 x bfloat> %op
; Declaration of the square-root intrinsic on a scalar bfloat: takes one
; bfloat and returns a bfloat. It is called by @v_sqrt_bf16.
24689 declare bfloat @llvm.sqrt.bf16(bfloat)
; Test: scalar bfloat square root through the @llvm.sqrt.bf16 intrinsic.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expected output shows, per check prefix:
;  - GCN and GFX7 widen the argument with v_mul_f32 by 1.0 followed by a
;    0xffff0000 mask, and narrow the result with the same mask.
;  - GFX8 through GFX11 widen with a 16-bit left shift and narrow with a
;    v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask / 16-bit right shift
;    sequence. NOTE(review): this looks like round-to-nearest-even with NaN
;    quieting through the 0x400000 bit -- confirm against the lowering code.
;  - Every prefix computes the root with v_sqrt_f32 and then adjusts it:
;    an input that compares below 0xf800000 is pre-multiplied by 0x4f800000
;    and its result is post-multiplied by 0x37800000; the root with its bit
;    pattern decremented or incremented by one is chosen based on the sign
;    of two v_fma_f32 residuals; finally v_cmp_class_f32 with mask 0x260
;    selects the (possibly pre-scaled) input instead of the computed root.
24691 define bfloat @v_sqrt_bf16(bfloat %a) {
24692 ; GCN-LABEL: v_sqrt_bf16:
24694 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24695 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24696 ; GCN-NEXT: s_mov_b32 s4, 0xf800000
24697 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260
24698 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24699 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
24700 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24701 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
24702 ; GCN-NEXT: v_sqrt_f32_e32 v2, v0
24703 ; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
24704 ; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
24705 ; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0
24706 ; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0
24707 ; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
24708 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
24709 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
24710 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
24711 ; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
24712 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
24713 ; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
24714 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
24715 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24716 ; GCN-NEXT: s_setpc_b64 s[30:31]
24718 ; GFX7-LABEL: v_sqrt_bf16:
24720 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24721 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24722 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24723 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000
24724 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24725 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24726 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24727 ; GFX7-NEXT: v_sqrt_f32_e32 v1, v0
24728 ; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
24729 ; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0
24730 ; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24731 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24732 ; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
24733 ; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0
24734 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24735 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24736 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24737 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24738 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
24739 ; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24740 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24741 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24742 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24744 ; GFX8-LABEL: v_sqrt_bf16:
24746 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24747 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24748 ; GFX8-NEXT: s_mov_b32 s4, 0xf800000
24749 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24750 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24751 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24752 ; GFX8-NEXT: v_sqrt_f32_e32 v1, v0
24753 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1
24754 ; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0
24755 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24756 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24757 ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1
24758 ; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0
24759 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24760 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24761 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24762 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24763 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
24764 ; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24765 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24766 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
24767 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
24768 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
24769 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
24770 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24771 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24772 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24773 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24775 ; GFX9-LABEL: v_sqrt_bf16:
24777 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24778 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24779 ; GFX9-NEXT: s_mov_b32 s4, 0xf800000
24780 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24781 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
24782 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24783 ; GFX9-NEXT: v_sqrt_f32_e32 v1, v0
24784 ; GFX9-NEXT: v_add_u32_e32 v2, -1, v1
24785 ; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0
24786 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
24787 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
24788 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
24789 ; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0
24790 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
24791 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
24792 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24793 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
24794 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
24795 ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
24796 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24797 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
24798 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
24799 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
24800 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
24801 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24802 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24803 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24804 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24806 ; GFX10-LABEL: v_sqrt_bf16:
24808 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24809 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24810 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24811 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24812 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24813 ; GFX10-NEXT: v_sqrt_f32_e32 v1, v0
24814 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1
24815 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1
24816 ; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0
24817 ; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0
24818 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4
24819 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
24820 ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
24821 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
24822 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24823 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24824 ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24825 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24826 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
24827 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
24828 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24829 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24830 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24831 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24832 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24834 ; GFX11-LABEL: v_sqrt_bf16:
24836 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24837 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24838 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24839 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
24840 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
24841 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
24842 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
24843 ; GFX11-NEXT: v_sqrt_f32_e32 v1, v0
24844 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
24845 ; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v1
24846 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1
24847 ; GFX11-NEXT: v_fma_f32 v4, -v2, v1, v0
24848 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
24849 ; GFX11-NEXT: v_fma_f32 v5, -v3, v1, v0
24850 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
24851 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
24852 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
24853 ; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
24854 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24855 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
24856 ; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
24857 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
24858 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
24859 ; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
24860 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
24861 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
24862 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
24863 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
24864 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24865 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24866 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24867 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24868 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24869 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24870 %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
; Declaration of the ldexp intrinsic specialised to a bfloat value and an i32
; second operand; it returns a bfloat.
; NOTE(review): presumably exercised by @v_ldexp_bf16_i32, which follows --
; confirm at its call site.
24874 declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
; Codegen test for the llvm.ldexp.bf16.i32 intrinsic on a scalar bfloat with an
; i32 exponent operand.
; Every run line is expected to perform the operation with v_ldexp_f32. The
; GCN and GFX7 expectations mask the operand and the result with 0xffff0000;
; the GFX8 and newer expectations shift the input left by 16 and narrow the
; f32 result with a v_bfe_u32 / add 0x7fff / v_cmp_u select / shift-right-by-16
; sequence (presumably round-to-nearest-even with NaN quieting -- confirm
; against the backend lowering).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
24876 define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
24877 ; GCN-LABEL: v_ldexp_bf16_i32:
24879 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24880 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24881 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24882 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
24883 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24884 ; GCN-NEXT: s_setpc_b64 s[30:31]
24886 ; GFX7-LABEL: v_ldexp_bf16_i32:
24888 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24889 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24890 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24891 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
24892 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24893 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24895 ; GFX8-LABEL: v_ldexp_bf16_i32:
24897 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24898 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24899 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
24900 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
24901 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
24902 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
24903 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
24904 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24905 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24906 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24907 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24909 ; GFX9-LABEL: v_ldexp_bf16_i32:
24911 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24912 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24913 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
24914 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
24915 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
24916 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
24917 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
24918 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24919 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
24920 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24921 ; GFX9-NEXT: s_setpc_b64 s[30:31]
24923 ; GFX10-LABEL: v_ldexp_bf16_i32:
24925 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24926 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24927 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
24928 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
24929 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
24930 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24931 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24932 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24933 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24934 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24936 ; GFX11-LABEL: v_ldexp_bf16_i32:
24938 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24939 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
24940 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
24941 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
24942 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
24943 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
24944 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
24945 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
24946 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
24947 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
24948 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
24949 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24950 ; GFX11-NEXT: s_setpc_b64 s[30:31]
24951 %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
24955 declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
; Codegen test for the llvm.frexp.bf16.i16 intrinsic; the function returns the
; { bfloat, i16 } pair produced by the call.
; All expectations use v_frexp_mant_f32 and v_frexp_exp_i32_f32, with the
; second result left in v1. Only the GCN (no -mcpu) expectations add a
; |v0| < 0x7f800000 compare that selects the original input and a zero
; exponent when the compare fails (looks like a guard for non-finite inputs
; -- confirm against the backend lowering).
; The GFX8 and newer expectations narrow the f32 mantissa with the
; v_bfe_u32 / add 0x7fff / v_cmp_u select / shift-right-by-16 sequence; GCN and
; GFX7 mask it with 0xffff0000 instead.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
24957 define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
24958 ; GCN-LABEL: v_frexp_bf16_i16:
24960 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24961 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
24962 ; GCN-NEXT: s_mov_b32 s4, 0x7f800000
24963 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24964 ; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0
24965 ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
24966 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
24967 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
24968 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
24969 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24970 ; GCN-NEXT: s_setpc_b64 s[30:31]
24972 ; GFX7-LABEL: v_frexp_bf16_i16:
24974 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24975 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
24976 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24977 ; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
24978 ; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0
24979 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
24980 ; GFX7-NEXT: s_setpc_b64 s[30:31]
24982 ; GFX8-LABEL: v_frexp_bf16_i16:
24984 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24985 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
24986 ; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1
24987 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
24988 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
24989 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
24990 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
24991 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
24992 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
24993 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
24994 ; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
24995 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24997 ; GFX9-LABEL: v_frexp_bf16_i16:
24999 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25000 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
25001 ; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
25002 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
25003 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25004 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
25005 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
25006 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25007 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
25008 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25009 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25010 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25012 ; GFX10-LABEL: v_frexp_bf16_i16:
25014 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25015 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
25016 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1
25017 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25018 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
25019 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
25020 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25021 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
25022 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25023 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25024 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25026 ; GFX11-LABEL: v_frexp_bf16_i16:
25028 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25029 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
25030 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25031 ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v1
25032 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
25033 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
25034 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25035 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25036 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
25037 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
25038 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
25039 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
25040 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25041 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25042 %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
25043 ret { bfloat, i16 } %op
25047 declare bfloat @llvm.log.bf16(bfloat)
25048 declare bfloat @llvm.log2.bf16(bfloat)
25049 declare bfloat @llvm.log10.bf16(bfloat)
; Codegen test for the llvm.log.bf16 intrinsic on a scalar bfloat.
; Every expectation is built around v_log_f32:
;   * the input is compared against 0x800000 and, when smaller, multiplied by
;     0x4f800000 before v_log_f32; 0x41b17218 is subtracted from the result in
;     that case (presumably denormal-input scaling by 2^32 with a matching
;     32*ln(2) correction -- confirm against the backend lowering);
;   * the v_log_f32 result is then multiplied by a constant split into a high
;     and a low part: 0x3f317000 / 0x3805fdf4 with separate mul/add steps on
;     GCN and GFX8, 0x3f317217 / 0x3377d1cf with v_fma_f32 (v_fmamk_f32 on
;     GFX10 and GFX11) on the other run lines (the constant looks like ln(2));
;   * the scaled value is only selected when |v_log_f32 result| is below
;     0x7f800000; otherwise the raw v_log_f32 result is kept.
; GCN and GFX7 mask the result with 0xffff0000; GFX8 and newer narrow it with
; the v_bfe_u32 / add 0x7fff / v_cmp_u select / shift-right-by-16 sequence.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25051 define bfloat @v_log_bf16(bfloat %a) {
25052 ; GCN-LABEL: v_log_bf16:
25054 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25055 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25056 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25057 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
25058 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000
25059 ; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218
25060 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25061 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25062 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25063 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
25064 ; GCN-NEXT: v_log_f32_e32 v0, v0
25065 ; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25066 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
25067 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
25068 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
25069 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
25070 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
25071 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25072 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3
25073 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3
25074 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25075 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25076 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
25077 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25078 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25079 ; GCN-NEXT: s_setpc_b64 s[30:31]
25081 ; GFX7-LABEL: v_log_bf16:
25083 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25084 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25085 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25086 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25087 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
25088 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25089 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25090 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
25091 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25092 ; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
25093 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25094 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25095 ; GFX7-NEXT: s_mov_b32 s4, 0x3377d1cf
25096 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25097 ; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
25098 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25099 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25100 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25101 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218
25102 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25103 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25104 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25105 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25107 ; GFX8-LABEL: v_log_bf16:
25109 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25110 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25111 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25112 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
25113 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25114 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25115 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
25116 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25117 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
25118 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25119 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
25120 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3f317000, v2
25121 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
25122 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
25123 ; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
25124 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
25125 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
25126 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
25127 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25128 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25129 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218
25130 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25131 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25132 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25133 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25134 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25135 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25136 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25137 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25138 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25139 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25141 ; GFX9-LABEL: v_log_bf16:
25143 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25144 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25145 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25146 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
25147 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25148 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25149 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
25150 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25151 ; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
25152 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25153 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
25154 ; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf
25155 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
25156 ; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
25157 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
25158 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25159 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25160 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
25161 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25162 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25163 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25164 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25165 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25166 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25167 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25168 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25169 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25170 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25172 ; GFX10-LABEL: v_log_bf16:
25174 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25175 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25176 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25177 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
25178 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
25179 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25180 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25181 ; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
25182 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25183 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
25184 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25185 ; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25186 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25187 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
25188 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25189 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25190 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25191 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25192 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25193 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25194 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25196 ; GFX11-LABEL: v_log_bf16:
25198 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25199 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25200 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25201 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25202 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
25203 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
25204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25205 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25206 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25207 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
25208 ; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
25209 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25210 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
25211 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
25212 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
25213 ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25214 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25215 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25216 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
25217 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25218 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25219 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25220 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25221 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25222 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25223 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25224 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25225 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25226 %op = call bfloat @llvm.log.bf16(bfloat %a)
; Codegen test for the llvm.log2.bf16 intrinsic on a scalar bfloat.
; Every expectation uses v_log_f32 directly, with no multiply by a constant
; afterwards. The input is compared against 0x800000 and, when smaller,
; multiplied by 0x4f800000 before v_log_f32; 0x42000000 (32.0) is subtracted
; from the result in that case (presumably denormal-input scaling by 2^32 --
; confirm against the backend lowering).
; GCN and GFX7 mask the result with 0xffff0000; GFX8 and newer narrow it with
; the v_bfe_u32 / add 0x7fff / v_cmp_u select / shift-right-by-16 sequence.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25230 define bfloat @v_log2_bf16(bfloat %a) {
25231 ; GCN-LABEL: v_log2_bf16:
25233 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25234 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25235 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25236 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
25237 ; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000
25238 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25239 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25240 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25241 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
25242 ; GCN-NEXT: v_log_f32_e32 v0, v0
25243 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
25244 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25245 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25246 ; GCN-NEXT: s_setpc_b64 s[30:31]
25248 ; GFX7-LABEL: v_log2_bf16:
25250 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25251 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25252 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25253 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25254 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
25255 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25256 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25257 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
25258 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25259 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
25260 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25261 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25262 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25263 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25265 ; GFX8-LABEL: v_log2_bf16:
25267 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25268 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25269 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25270 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
25271 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25272 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25273 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
25274 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25275 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
25276 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25277 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25278 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25279 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25280 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25281 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25282 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25283 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25284 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25285 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25287 ; GFX9-LABEL: v_log2_bf16:
25289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25290 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25291 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25292 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25293 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000
25294 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
25295 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
25296 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25297 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
25298 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25299 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25300 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25301 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25302 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25303 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25304 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25305 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25306 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25307 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25309 ; GFX10-LABEL: v_log2_bf16:
25311 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25312 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25313 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25314 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
25315 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25316 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
25317 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25318 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
25319 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25320 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25321 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25322 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25323 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25324 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25325 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25327 ; GFX11-LABEL: v_log2_bf16:
25329 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25330 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25332 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25333 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
25334 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
25335 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
25336 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25337 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25338 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25339 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
25340 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25341 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25342 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25344 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25345 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25346 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
25347 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25348 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25349 %op = call bfloat @llvm.log2.bf16(bfloat %a)
; Codegen test for the llvm.log10.bf16 intrinsic on a scalar bfloat.
; The expectations have the same shape as the llvm.log.bf16 test, with
; different constants:
;   * the input is compared against 0x800000 and, when smaller, multiplied by
;     0x4f800000 before v_log_f32; 0x411a209b is subtracted from the result in
;     that case (presumably denormal-input scaling by 2^32 with a matching
;     32*log10(2) correction -- confirm against the backend lowering);
;   * the v_log_f32 result is multiplied by a constant split into a high and a
;     low part: 0x3e9a2000 / 0x369a84fb with separate mul/add steps on GCN and
;     GFX8, 0x3e9a209a / 0x3284fbcf with v_fma_f32 (v_fmamk_f32 on GFX10 and
;     GFX11) on the other run lines (the constant looks like log10(2));
;   * the scaled value is only selected when |v_log_f32 result| is below
;     0x7f800000; otherwise the raw v_log_f32 result is kept.
; GCN and GFX7 mask the result with 0xffff0000; GFX8 and newer narrow it with
; the v_bfe_u32 / add 0x7fff / v_cmp_u select / shift-right-by-16 sequence.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
25353 define bfloat @v_log10_bf16(bfloat %a) {
25354 ; GCN-LABEL: v_log10_bf16:
25356 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25357 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25358 ; GCN-NEXT: s_mov_b32 s4, 0x800000
25359 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
25360 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000
25361 ; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b
25362 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25363 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25364 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25365 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
25366 ; GCN-NEXT: v_log_f32_e32 v0, v0
25367 ; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25368 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
25369 ; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
25370 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
25371 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
25372 ; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
25373 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25374 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3
25375 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3
25376 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
25377 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25378 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
25379 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
25380 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25381 ; GCN-NEXT: s_setpc_b64 s[30:31]
25383 ; GFX7-LABEL: v_log10_bf16:
25385 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25386 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25387 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25388 ; GFX7-NEXT: s_mov_b32 s4, 0x800000
25389 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
25390 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25391 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25392 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
25393 ; GFX7-NEXT: v_log_f32_e32 v0, v0
25394 ; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
25395 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25396 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25397 ; GFX7-NEXT: s_mov_b32 s4, 0x3284fbcf
25398 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25399 ; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
25400 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25401 ; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25402 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25403 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b
25404 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25405 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
25406 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25407 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25409 ; GFX8-LABEL: v_log10_bf16:
25411 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25412 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25413 ; GFX8-NEXT: s_mov_b32 s4, 0x800000
25414 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
25415 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25416 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25417 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
25418 ; GFX8-NEXT: v_log_f32_e32 v0, v0
25419 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
25420 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
25421 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
25422 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v2
25423 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
25424 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
25425 ; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
25426 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
25427 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
25428 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
25429 ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25430 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25431 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b
25432 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25433 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
25434 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25435 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25436 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25437 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25438 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25439 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25440 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25441 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25443 ; GFX9-LABEL: v_log10_bf16:
25445 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25446 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25447 ; GFX9-NEXT: s_mov_b32 s4, 0x800000
25448 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
25449 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25450 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25451 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
25452 ; GFX9-NEXT: v_log_f32_e32 v0, v0
25453 ; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
25454 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25455 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
25456 ; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf
25457 ; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
25458 ; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
25459 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
25460 ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
25461 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
25462 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
25463 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25464 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
25465 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25466 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25467 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25468 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25469 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25470 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25471 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25472 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25474 ; GFX10-LABEL: v_log10_bf16:
25476 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25477 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25478 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25479 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
25480 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
25481 ; GFX10-NEXT: v_log_f32_e32 v0, v0
25482 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25483 ; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
25484 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25485 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
25486 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25487 ; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25488 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25489 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
25490 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25491 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25492 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25493 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25494 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25495 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25496 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25498 ; GFX11-LABEL: v_log10_bf16:
25500 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25501 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25502 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
25503 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
25504 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
25505 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
25506 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25507 ; GFX11-NEXT: v_log_f32_e32 v0, v0
25508 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25509 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
25510 ; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
25511 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25512 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
25513 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
25514 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
25515 ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
25516 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25517 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
25518 ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
25519 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25520 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25521 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25522 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25523 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25524 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25525 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25526 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25527 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25528 %op = call bfloat @llvm.log10.bf16(bfloat %a)
25532 declare bfloat @llvm.exp.bf16(bfloat)
25533 declare bfloat @llvm.exp2.bf16(bfloat)
25534 declare bfloat @llvm.exp10.bf16(bfloat)
25536 define bfloat @v_exp_bf16(bfloat %a) {
25537 ; GCN-LABEL: v_exp_bf16:
25539 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25540 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25541 ; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25542 ; GCN-NEXT: s_mov_b32 s5, 0x42b17218
25543 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
25544 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25545 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0
25546 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
25547 ; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
25548 ; GCN-NEXT: v_rndne_f32_e32 v5, v2
25549 ; GCN-NEXT: v_mul_f32_e32 v6, 0x39a3b295, v3
25550 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
25551 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
25552 ; GCN-NEXT: v_add_f32_e32 v3, v3, v6
25553 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
25554 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25555 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25556 ; GCN-NEXT: v_exp_f32_e32 v2, v2
25557 ; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
25558 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25559 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25560 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
25561 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25562 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25563 ; GCN-NEXT: s_setpc_b64 s[30:31]
25565 ; GFX7-LABEL: v_exp_bf16:
25567 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25568 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25569 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25570 ; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b
25571 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25572 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25573 ; GFX7-NEXT: s_mov_b32 s4, 0x32a5705f
25574 ; GFX7-NEXT: v_rndne_f32_e32 v3, v1
25575 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25576 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
25577 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25578 ; GFX7-NEXT: v_exp_f32_e32 v1, v1
25579 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
25580 ; GFX7-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25581 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25582 ; GFX7-NEXT: s_mov_b32 s4, 0x42b17218
25583 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
25584 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25585 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
25586 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25587 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25588 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25589 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25591 ; GFX8-LABEL: v_exp_bf16:
25593 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25594 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25595 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
25596 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0
25597 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3
25598 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
25599 ; GFX8-NEXT: v_rndne_f32_e32 v2, v1
25600 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
25601 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
25602 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
25603 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
25604 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
25605 ; GFX8-NEXT: v_exp_f32_e32 v1, v1
25606 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
25607 ; GFX8-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25608 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25609 ; GFX8-NEXT: s_mov_b32 s4, 0x42b17218
25610 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
25611 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25612 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
25613 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25614 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25615 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25616 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25617 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25618 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25619 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25620 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25621 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25622 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25624 ; GFX9-LABEL: v_exp_bf16:
25626 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25627 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25628 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25629 ; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b
25630 ; GFX9-NEXT: v_rndne_f32_e32 v2, v1
25631 ; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
25632 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
25633 ; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f
25634 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
25635 ; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
25636 ; GFX9-NEXT: v_exp_f32_e32 v1, v1
25637 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
25638 ; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0
25639 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25640 ; GFX9-NEXT: s_mov_b32 s4, 0x42b17218
25641 ; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
25642 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25643 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
25644 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25645 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25646 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25647 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25648 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25649 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25650 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25651 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25652 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25653 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25655 ; GFX10-LABEL: v_exp_bf16:
25657 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25658 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25659 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25660 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25661 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1
25662 ; GFX10-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25663 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
25664 ; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
25665 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
25666 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
25667 ; GFX10-NEXT: v_exp_f32_e32 v1, v1
25668 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
25669 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25670 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25671 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25672 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25673 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25674 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25675 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25676 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25677 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25678 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25680 ; GFX11-LABEL: v_exp_bf16:
25682 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25683 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25684 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25685 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
25686 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1
25687 ; GFX11-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
25688 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25689 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
25690 ; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
25691 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
25692 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
25693 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25694 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
25695 ; GFX11-NEXT: v_exp_f32_e32 v1, v1
25696 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25697 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
25698 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
25699 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25700 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
25701 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25702 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
25703 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25704 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25705 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25706 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25707 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25708 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25709 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25710 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25711 %op = call bfloat @llvm.exp.bf16(bfloat %a)
; v_exp2_bf16: codegen test for the llvm.exp2 intrinsic applied to a single
; bfloat argument; the call result is named %op.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - Every target compares the input against 0xc2fc0000. When that compare is
;    true it adds 0x42800000 to the input before v_exp_f32 and multiplies the
;    result by 0x1f800000 afterwards; otherwise it adds 0 and multiplies by 1.0.
;    NOTE(review): this looks like input scaling for very negative arguments
;    (the constants decode to -126.0, 64.0 and 2^-64) - confirm against the
;    lowering code.
;  - The GCN and GFX7 output takes the argument through v_mul_f32 by 1.0 plus a
;    0xffff0000 mask, and masks the f32 result with 0xffff0000 again at the end.
;  - GFX8 and later shift the argument left by 16 and finish with a
;    v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence
;    followed by a 16-bit right shift. NOTE(review): presumably
;    round-to-nearest-even with NaN quieting - confirm.
25715 define bfloat @v_exp2_bf16(bfloat %a) {
25716 ; GCN-LABEL: v_exp2_bf16:
25718 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25719 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25720 ; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
25721 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
25722 ; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000
25723 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25724 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25725 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25726 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
25727 ; GCN-NEXT: v_exp_f32_e32 v0, v0
25728 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
25729 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
25730 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25731 ; GCN-NEXT: s_setpc_b64 s[30:31]
25733 ; GFX7-LABEL: v_exp2_bf16:
25735 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25736 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25737 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25738 ; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000
25739 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000
25740 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25741 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25742 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
25743 ; GFX7-NEXT: v_exp_f32_e32 v0, v0
25744 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000
25745 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25746 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
25747 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25748 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25750 ; GFX8-LABEL: v_exp2_bf16:
25752 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25753 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25754 ; GFX8-NEXT: s_mov_b32 s4, 0xc2fc0000
25755 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42800000
25756 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25757 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25758 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
25759 ; GFX8-NEXT: v_exp_f32_e32 v0, v0
25760 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
25761 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25762 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
25763 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25764 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25765 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25766 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25767 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25768 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25769 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25770 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25772 ; GFX9-LABEL: v_exp2_bf16:
25774 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25775 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25776 ; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
25777 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
25778 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000
25779 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25780 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
25781 ; GFX9-NEXT: v_exp_f32_e32 v0, v0
25782 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
25783 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
25784 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25785 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
25786 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25787 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25788 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25789 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25790 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25791 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25792 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25794 ; GFX10-LABEL: v_exp2_bf16:
25796 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25797 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25798 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25799 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25800 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
25801 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
25802 ; GFX10-NEXT: v_exp_f32_e32 v0, v0
25803 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
25804 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25805 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25806 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25807 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25808 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25809 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25810 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25812 ; GFX11-LABEL: v_exp2_bf16:
25814 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25815 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25816 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
25817 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
25818 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
25819 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
25820 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
25821 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
25822 ; GFX11-NEXT: v_exp_f32_e32 v0, v0
25823 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25824 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
25825 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
25826 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
25827 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25828 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25829 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25830 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25831 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
25832 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25833 ; GFX11-NEXT: s_setpc_b64 s[30:31]
25834 %op = call bfloat @llvm.exp2.bf16(bfloat %a)
; v_exp10_bf16: codegen test for the llvm.exp10 intrinsic applied to a single
; bfloat argument; the call result is named %op.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - The GCN and GFX8 output multiplies the input by the constant pair
;    0x40549000 / 0x3a2784bc using separate v_mul_f32 and v_add_f32 steps.
;    GFX7, GFX9, GFX10 and GFX11 instead use v_fma_f32 (v_fmamk_f32 on
;    GFX10 and GFX11) with 0x40549a78 / 0x33979a37.
;    NOTE(review): presumably a split high/low scale factor chosen by whether
;    the target has a fast f32 FMA - confirm against the lowering code.
;  - All targets then run v_rndne_f32, v_exp_f32, v_cvt_i32_f32 and
;    v_ldexp_f32, and select 0 when 0xc23369f4 compares greater than the input
;    and 0x7f800000 (the +infinity bit pattern) when 0x421a209b compares less
;    than the input.
;  - The GCN and GFX7 output masks the f32 result with 0xffff0000; GFX8 and later
;    finish with a v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 /
;    v_cndmask sequence followed by a 16-bit right shift. NOTE(review):
;    presumably round-to-nearest-even with NaN quieting - confirm.
25838 define bfloat @v_exp10_bf16(bfloat %a) {
25839 ; GCN-LABEL: v_exp10_bf16:
25841 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25842 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
25843 ; GCN-NEXT: s_mov_b32 s4, 0xc23369f4
25844 ; GCN-NEXT: s_mov_b32 s5, 0x421a209b
25845 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
25846 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25847 ; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0
25848 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
25849 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
25850 ; GCN-NEXT: v_rndne_f32_e32 v5, v2
25851 ; GCN-NEXT: v_mul_f32_e32 v6, 0x3a2784bc, v3
25852 ; GCN-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
25853 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
25854 ; GCN-NEXT: v_add_f32_e32 v3, v3, v6
25855 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
25856 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3
25857 ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
25858 ; GCN-NEXT: v_exp_f32_e32 v2, v2
25859 ; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
25860 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25861 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
25862 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
25863 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25864 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25865 ; GCN-NEXT: s_setpc_b64 s[30:31]
25867 ; GFX7-LABEL: v_exp10_bf16:
25869 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25870 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
25871 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25872 ; GFX7-NEXT: s_mov_b32 s4, 0x40549a78
25873 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25874 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
25875 ; GFX7-NEXT: s_mov_b32 s4, 0x33979a37
25876 ; GFX7-NEXT: v_rndne_f32_e32 v3, v1
25877 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
25878 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
25879 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
25880 ; GFX7-NEXT: v_exp_f32_e32 v1, v1
25881 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
25882 ; GFX7-NEXT: s_mov_b32 s4, 0xc23369f4
25883 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25884 ; GFX7-NEXT: s_mov_b32 s4, 0x421a209b
25885 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
25886 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25887 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
25888 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25889 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25890 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
25891 ; GFX7-NEXT: s_setpc_b64 s[30:31]
25893 ; GFX8-LABEL: v_exp10_bf16:
25895 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25896 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25897 ; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
25898 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x40549000, v0
25899 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3
25900 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
25901 ; GFX8-NEXT: v_rndne_f32_e32 v2, v1
25902 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
25903 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
25904 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
25905 ; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
25906 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
25907 ; GFX8-NEXT: v_exp_f32_e32 v1, v1
25908 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
25909 ; GFX8-NEXT: s_mov_b32 s4, 0xc23369f4
25910 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25911 ; GFX8-NEXT: s_mov_b32 s4, 0x421a209b
25912 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
25913 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25914 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
25915 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25916 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25917 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
25918 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
25919 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
25920 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
25921 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25922 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25923 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25924 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25926 ; GFX9-LABEL: v_exp10_bf16:
25928 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25929 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25930 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25931 ; GFX9-NEXT: s_mov_b32 s4, 0x40549a78
25932 ; GFX9-NEXT: v_rndne_f32_e32 v2, v1
25933 ; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
25934 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
25935 ; GFX9-NEXT: s_mov_b32 s4, 0x33979a37
25936 ; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
25937 ; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
25938 ; GFX9-NEXT: v_exp_f32_e32 v1, v1
25939 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
25940 ; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4
25941 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
25942 ; GFX9-NEXT: s_mov_b32 s4, 0x421a209b
25943 ; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
25944 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
25945 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
25946 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
25947 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
25948 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
25949 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
25950 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
25951 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
25952 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
25953 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
25954 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25955 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25957 ; GFX10-LABEL: v_exp10_bf16:
25959 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25960 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25961 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25962 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
25963 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1
25964 ; GFX10-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
25965 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
25966 ; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
25967 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
25968 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
25969 ; GFX10-NEXT: v_exp_f32_e32 v1, v1
25970 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
25971 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
25972 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
25973 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
25974 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
25975 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
25976 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
25977 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
25978 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
25979 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
25980 ; GFX10-NEXT: s_setpc_b64 s[30:31]
25982 ; GFX11-LABEL: v_exp10_bf16:
25984 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25985 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
25986 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25987 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
25988 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1
25989 ; GFX11-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
25990 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
25991 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
25992 ; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
25993 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
25994 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
25995 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
25996 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
25997 ; GFX11-NEXT: v_exp_f32_e32 v1, v1
25998 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
25999 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
26000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
26001 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
26002 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
26003 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
26004 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26005 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26006 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26007 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26008 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26009 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26010 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26011 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26012 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26013 %op = call bfloat @llvm.exp10.bf16(bfloat %a)
; v_ceil_bf16: codegen test for the llvm.ceil intrinsic (declared just below)
; applied to a single bfloat argument; the call result is named %op.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - Every target performs the operation with a single v_ceil_f32.
;  - The GCN and GFX7 output takes the argument through v_mul_f32 by 1.0 plus a
;    0xffff0000 mask, and masks the f32 result with 0xffff0000 again at the end.
;  - GFX8 and later shift the argument left by 16 and finish with a
;    v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence
;    followed by a 16-bit right shift. NOTE(review): presumably
;    round-to-nearest-even with NaN quieting - confirm.
26017 declare bfloat @llvm.ceil.bf16(bfloat)
26019 define bfloat @v_ceil_bf16(bfloat %a) {
26020 ; GCN-LABEL: v_ceil_bf16:
26022 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26023 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26024 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26025 ; GCN-NEXT: v_ceil_f32_e32 v0, v0
26026 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26027 ; GCN-NEXT: s_setpc_b64 s[30:31]
26029 ; GFX7-LABEL: v_ceil_bf16:
26031 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26032 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26033 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26034 ; GFX7-NEXT: v_ceil_f32_e32 v0, v0
26035 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26036 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26038 ; GFX8-LABEL: v_ceil_bf16:
26040 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26041 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26042 ; GFX8-NEXT: v_ceil_f32_e32 v0, v0
26043 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26044 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26045 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26046 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26047 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26048 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26049 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26050 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26052 ; GFX9-LABEL: v_ceil_bf16:
26054 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26055 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26056 ; GFX9-NEXT: v_ceil_f32_e32 v0, v0
26057 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26058 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26059 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26060 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26061 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26062 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26063 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26064 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26066 ; GFX10-LABEL: v_ceil_bf16:
26068 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26069 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26070 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0
26071 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26072 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26073 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26074 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26075 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26076 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26077 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26079 ; GFX11-LABEL: v_ceil_bf16:
26081 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26082 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26083 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26084 ; GFX11-NEXT: v_ceil_f32_e32 v0, v0
26085 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26086 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26087 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26088 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26089 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26090 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26091 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26092 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26093 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26094 %op = call bfloat @llvm.ceil.bf16(bfloat %a)
; v_trunc_bf16: codegen test for the llvm.trunc intrinsic (declared just below)
; applied to a single bfloat argument; the call result is named %op.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - Every target performs the operation with a single v_trunc_f32.
;  - The GCN and GFX7 output takes the argument through v_mul_f32 by 1.0 plus a
;    0xffff0000 mask, and masks the f32 result with 0xffff0000 again at the end.
;  - GFX8 and later shift the argument left by 16 and finish with a
;    v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence
;    followed by a 16-bit right shift. NOTE(review): presumably
;    round-to-nearest-even with NaN quieting - confirm.
26098 declare bfloat @llvm.trunc.bf16(bfloat)
26100 define bfloat @v_trunc_bf16(bfloat %a) {
26101 ; GCN-LABEL: v_trunc_bf16:
26103 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26104 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26105 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26106 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
26107 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26108 ; GCN-NEXT: s_setpc_b64 s[30:31]
26110 ; GFX7-LABEL: v_trunc_bf16:
26112 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26113 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26114 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26115 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
26116 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26117 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26119 ; GFX8-LABEL: v_trunc_bf16:
26121 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26122 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26123 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0
26124 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26125 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26126 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26127 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26128 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26129 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26130 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26131 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26133 ; GFX9-LABEL: v_trunc_bf16:
26135 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26136 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26137 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0
26138 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26139 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26140 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26141 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26142 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26143 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26144 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26145 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26147 ; GFX10-LABEL: v_trunc_bf16:
26149 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26150 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26151 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
26152 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26153 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26154 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26155 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26156 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26157 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26158 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26160 ; GFX11-LABEL: v_trunc_bf16:
26162 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26163 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26165 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
26166 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26167 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26168 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26169 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26170 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26171 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26172 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26173 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26174 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26175 %op = call bfloat @llvm.trunc.bf16(bfloat %a)
; v_rint_bf16: codegen test for the llvm.rint intrinsic (declared just below)
; applied to a single bfloat argument; the call result is named %op.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - Every target performs the operation with a single v_rndne_f32.
;  - The GCN and GFX7 output takes the argument through v_mul_f32 by 1.0 plus a
;    0xffff0000 mask, and masks the f32 result with 0xffff0000 again at the end.
;  - GFX8 and later shift the argument left by 16 and finish with a
;    v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence
;    followed by a 16-bit right shift. NOTE(review): presumably
;    round-to-nearest-even with NaN quieting - confirm.
26179 declare bfloat @llvm.rint.bf16(bfloat)
26181 define bfloat @v_rint_bf16(bfloat %a) {
26182 ; GCN-LABEL: v_rint_bf16:
26184 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26185 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26186 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26187 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26188 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26189 ; GCN-NEXT: s_setpc_b64 s[30:31]
26191 ; GFX7-LABEL: v_rint_bf16:
26193 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26194 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26195 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26196 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26197 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26198 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26200 ; GFX8-LABEL: v_rint_bf16:
26202 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26203 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26204 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26205 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26206 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26207 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26208 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26209 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26210 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26211 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26212 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26214 ; GFX9-LABEL: v_rint_bf16:
26216 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26217 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26218 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26219 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26220 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26221 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26222 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26223 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26224 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26225 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26226 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26228 ; GFX10-LABEL: v_rint_bf16:
26230 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26231 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26232 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26233 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26234 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26235 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26236 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26237 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26238 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26239 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26241 ; GFX11-LABEL: v_rint_bf16:
26243 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26244 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26245 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26246 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26247 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26248 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26249 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26250 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26251 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26252 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26253 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26254 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26255 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26256 %op = call bfloat @llvm.rint.bf16(bfloat %a)
; v_nearbyint_bf16: codegen test for the llvm.nearbyint intrinsic (declared
; just below) applied to a single bfloat argument; the call result is named %op.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of hand-editing.
; What the expected output shows:
;  - Every target performs the operation with a single v_rndne_f32, so the
;    instruction sequences match those expected for @v_rint_bf16.
;  - The GCN and GFX7 output takes the argument through v_mul_f32 by 1.0 plus a
;    0xffff0000 mask, and masks the f32 result with 0xffff0000 again at the end.
;  - GFX8 and later shift the argument left by 16 and finish with a
;    v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence
;    followed by a 16-bit right shift. NOTE(review): presumably
;    round-to-nearest-even with NaN quieting - confirm.
26260 declare bfloat @llvm.nearbyint.bf16(bfloat)
26262 define bfloat @v_nearbyint_bf16(bfloat %a) {
26263 ; GCN-LABEL: v_nearbyint_bf16:
26265 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26266 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26267 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26268 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26269 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26270 ; GCN-NEXT: s_setpc_b64 s[30:31]
26272 ; GFX7-LABEL: v_nearbyint_bf16:
26274 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26275 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26276 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26277 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26278 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26279 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26281 ; GFX8-LABEL: v_nearbyint_bf16:
26283 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26284 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26285 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26286 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26287 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26288 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26289 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26290 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26291 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26292 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26293 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26295 ; GFX9-LABEL: v_nearbyint_bf16:
26297 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26298 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26299 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26300 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26301 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26302 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26303 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26304 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26305 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26306 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26307 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26309 ; GFX10-LABEL: v_nearbyint_bf16:
26311 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26312 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26313 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26314 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26315 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26316 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26317 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26318 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26319 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26320 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26322 ; GFX11-LABEL: v_nearbyint_bf16:
26324 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26325 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26326 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26327 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26328 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26329 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26330 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26332 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26333 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26334 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26335 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26336 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26337 %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
; Declaration of the llvm.round intrinsic for bfloat: one bfloat operand,
; bfloat result. It directly precedes the @v_round_bf16 test.
26341 declare bfloat @llvm.round.bf16(bfloat)
; Codegen test for llvm.round.bf16 on a single bfloat argument.
; Every expected output performs the rounding with f32 instructions
; (v_trunc_f32, v_sub_f32, a |x| >= 0.5 compare, v_bfi_b32, v_add_f32).
; The GFX8 and newer outputs then narrow the f32 result with a
; v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 / shift-right-by-16
; sequence; the GCN and GFX7 outputs only mask the result with 0xffff0000.
26343 define bfloat @v_round_bf16(bfloat %a) {
26344 ; GCN-LABEL: v_round_bf16:
26346 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26347 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26348 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26349 ; GCN-NEXT: v_trunc_f32_e32 v1, v0
26350 ; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
26351 ; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26352 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26353 ; GCN-NEXT: s_brev_b32 s4, -2
26354 ; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
26355 ; GCN-NEXT: v_add_f32_e32 v0, v1, v0
26356 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26357 ; GCN-NEXT: s_setpc_b64 s[30:31]
26359 ; GFX7-LABEL: v_round_bf16:
26361 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26362 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26363 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26364 ; GFX7-NEXT: v_trunc_f32_e32 v1, v0
26365 ; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1
26366 ; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26367 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26368 ; GFX7-NEXT: s_brev_b32 s4, -2
26369 ; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0
26370 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
26371 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26372 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26374 ; GFX8-LABEL: v_round_bf16:
26376 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26377 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26378 ; GFX8-NEXT: v_trunc_f32_e32 v1, v0
26379 ; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
26380 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26381 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26382 ; GFX8-NEXT: s_brev_b32 s4, -2
26383 ; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0
26384 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
26385 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26386 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26387 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26388 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26389 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26390 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26391 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26392 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26394 ; GFX9-LABEL: v_round_bf16:
26396 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26397 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26398 ; GFX9-NEXT: v_trunc_f32_e32 v1, v0
26399 ; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
26400 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
26401 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
26402 ; GFX9-NEXT: s_brev_b32 s4, -2
26403 ; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
26404 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
26405 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26406 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26407 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26408 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26409 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26410 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26411 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26412 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26414 ; GFX10-LABEL: v_round_bf16:
26416 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26417 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26418 ; GFX10-NEXT: v_trunc_f32_e32 v1, v0
26419 ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1
26420 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
26421 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
26422 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
26423 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
26424 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26425 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26426 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26427 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26428 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26429 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26430 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26432 ; GFX11-LABEL: v_round_bf16:
26434 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26435 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26436 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26437 ; GFX11-NEXT: v_trunc_f32_e32 v1, v0
26438 ; GFX11-NEXT: v_sub_f32_e32 v2, v0, v1
26439 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26440 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
26441 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
26442 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26443 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
26444 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
26445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
26446 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26447 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26448 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26449 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26450 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26451 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26452 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26453 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26454 %op = call bfloat @llvm.round.bf16(bfloat %a)
26458 declare bfloat @llvm.roundeven.bf16(bfloat)
; Codegen test for llvm.roundeven.bf16 on a single bfloat argument.
; Every expected output performs the operation with a single v_rndne_f32.
; The GFX8 and newer outputs then narrow the f32 result with a
; v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 / shift-right-by-16
; sequence; the GCN and GFX7 outputs only mask the result with 0xffff0000.
26460 define bfloat @v_roundeven_bf16(bfloat %a) {
26461 ; GCN-LABEL: v_roundeven_bf16:
26463 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26464 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26465 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26466 ; GCN-NEXT: v_rndne_f32_e32 v0, v0
26467 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26468 ; GCN-NEXT: s_setpc_b64 s[30:31]
26470 ; GFX7-LABEL: v_roundeven_bf16:
26472 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26473 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26474 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26475 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0
26476 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26477 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26479 ; GFX8-LABEL: v_roundeven_bf16:
26481 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26482 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26483 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0
26484 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26485 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26486 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26487 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26488 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26489 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26490 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26491 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26493 ; GFX9-LABEL: v_roundeven_bf16:
26495 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26496 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26497 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0
26498 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26499 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26500 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26501 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26502 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26503 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26504 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26505 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26507 ; GFX10-LABEL: v_roundeven_bf16:
26509 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26510 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26511 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0
26512 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26513 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26514 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26515 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26516 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26517 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26518 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26520 ; GFX11-LABEL: v_roundeven_bf16:
26522 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26523 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26524 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26525 ; GFX11-NEXT: v_rndne_f32_e32 v0, v0
26526 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26527 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26528 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26529 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26530 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26531 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26532 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26533 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26534 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26535 %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
26539 declare bfloat @llvm.floor.bf16(bfloat)
; Codegen test for llvm.floor.bf16 on a single bfloat argument.
; Every expected output performs the operation with a single v_floor_f32.
; The GFX8 and newer outputs then narrow the f32 result with a
; v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 / shift-right-by-16
; sequence; the GCN and GFX7 outputs only mask the result with 0xffff0000.
26541 define bfloat @v_floor_bf16(bfloat %a) {
26542 ; GCN-LABEL: v_floor_bf16:
26544 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26545 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26546 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26547 ; GCN-NEXT: v_floor_f32_e32 v0, v0
26548 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26549 ; GCN-NEXT: s_setpc_b64 s[30:31]
26551 ; GFX7-LABEL: v_floor_bf16:
26553 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26554 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26555 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26556 ; GFX7-NEXT: v_floor_f32_e32 v0, v0
26557 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26558 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26560 ; GFX8-LABEL: v_floor_bf16:
26562 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26563 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26564 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
26565 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26566 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26567 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26568 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26569 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26570 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26571 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26572 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26574 ; GFX9-LABEL: v_floor_bf16:
26576 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26577 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26578 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
26579 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26580 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26581 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26582 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26583 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26584 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26585 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26586 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26588 ; GFX10-LABEL: v_floor_bf16:
26590 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26591 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26592 ; GFX10-NEXT: v_floor_f32_e32 v0, v0
26593 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26594 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26595 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26596 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26597 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26598 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26599 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26601 ; GFX11-LABEL: v_floor_bf16:
26603 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26604 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26605 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26606 ; GFX11-NEXT: v_floor_f32_e32 v0, v0
26607 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26608 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26609 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26610 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26611 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26612 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26613 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26614 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26615 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26616 %op = call bfloat @llvm.floor.bf16(bfloat %a)
26620 declare bfloat @llvm.canonicalize.bf16(bfloat)
; Codegen test for llvm.canonicalize.bf16 on a single bfloat argument.
; The GCN and GFX7 outputs consist of a v_mul_f32 by 1.0 followed by two
; identical 0xffff0000 masks. The GFX8 output multiplies the widened value by
; 1.0, whereas GFX9 and newer use v_max_f32 v0, v0, v0; all of GFX8 and newer
; then narrow the f32 result with the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 /
; v_cndmask_b32 / shift-right-by-16 sequence.
26622 define bfloat @v_canonicalize_bf16(bfloat %a) {
26623 ; GCN-LABEL: v_canonicalize_bf16:
26625 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26626 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26627 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26628 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26629 ; GCN-NEXT: s_setpc_b64 s[30:31]
26631 ; GFX7-LABEL: v_canonicalize_bf16:
26633 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26634 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26635 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26636 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26637 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26639 ; GFX8-LABEL: v_canonicalize_bf16:
26641 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26642 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26643 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
26644 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
26645 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
26646 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
26647 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
26648 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26649 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26650 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26651 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26653 ; GFX9-LABEL: v_canonicalize_bf16:
26655 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26656 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26657 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
26658 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
26659 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
26660 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
26661 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
26662 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
26663 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
26664 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26665 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26667 ; GFX10-LABEL: v_canonicalize_bf16:
26669 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26670 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26671 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
26672 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
26673 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
26674 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26675 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26676 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26677 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26678 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26680 ; GFX11-LABEL: v_canonicalize_bf16:
26682 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26683 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26684 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26685 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
26686 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
26687 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
26688 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
26689 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
26690 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
26691 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
26692 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26693 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
26694 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26695 %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
26699 declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
26701 ; FIXME: Promotion is broken for llvm.arithmetic.fence.bf16, so the test below is left commented out.
26702 ; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
26703 ; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
; Codegen test for `fcmp false` on two bfloat arguments, returning i1.
; Every expected output contains no compare instruction: it moves the
; constant 0 into v0 and returns.
26707 define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
26708 ; GCN-LABEL: v_fcmp_false_bf16:
26710 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26711 ; GCN-NEXT: v_mov_b32_e32 v0, 0
26712 ; GCN-NEXT: s_setpc_b64 s[30:31]
26714 ; GFX7-LABEL: v_fcmp_false_bf16:
26716 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26717 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
26718 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26720 ; GFX8-LABEL: v_fcmp_false_bf16:
26722 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26723 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
26724 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26726 ; GFX9-LABEL: v_fcmp_false_bf16:
26728 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26729 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
26730 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26732 ; GFX10-LABEL: v_fcmp_false_bf16:
26734 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26735 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
26736 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26738 ; GFX11-LABEL: v_fcmp_false_bf16:
26740 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26741 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
26742 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26743 %op = fcmp false bfloat %a, %b
; Codegen test for `fcmp oeq` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_eq_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
26747 define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
26748 ; GCN-LABEL: v_fcmp_oeq_bf16:
26750 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26751 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26752 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26753 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26754 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26755 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26756 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26757 ; GCN-NEXT: s_setpc_b64 s[30:31]
26759 ; GFX7-LABEL: v_fcmp_oeq_bf16:
26761 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26762 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26763 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26764 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26765 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26766 ; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26767 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26768 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26770 ; GFX8-LABEL: v_fcmp_oeq_bf16:
26772 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26773 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26774 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26775 ; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26776 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26777 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26779 ; GFX9-LABEL: v_fcmp_oeq_bf16:
26781 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26782 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26783 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26784 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
26785 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26786 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26788 ; GFX10-LABEL: v_fcmp_oeq_bf16:
26790 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26791 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26792 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26793 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
26794 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26795 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26797 ; GFX11-LABEL: v_fcmp_oeq_bf16:
26799 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26800 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26801 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26802 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26803 ; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
26804 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26805 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26806 %op = fcmp oeq bfloat %a, %b
; Codegen test for `fcmp ogt` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_gt_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
26810 define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
26811 ; GCN-LABEL: v_fcmp_ogt_bf16:
26813 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26814 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26815 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26816 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26817 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26818 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26819 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26820 ; GCN-NEXT: s_setpc_b64 s[30:31]
26822 ; GFX7-LABEL: v_fcmp_ogt_bf16:
26824 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26825 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26826 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26827 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26828 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26829 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26830 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26831 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26833 ; GFX8-LABEL: v_fcmp_ogt_bf16:
26835 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26836 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26837 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26838 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26839 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26840 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26842 ; GFX9-LABEL: v_fcmp_ogt_bf16:
26844 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26845 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26846 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26847 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
26848 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26849 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26851 ; GFX10-LABEL: v_fcmp_ogt_bf16:
26853 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26854 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26855 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26856 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
26857 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26858 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26860 ; GFX11-LABEL: v_fcmp_ogt_bf16:
26862 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26863 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26864 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26865 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26866 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
26867 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26868 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26869 %op = fcmp ogt bfloat %a, %b
; Codegen test for `fcmp oge` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_ge_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
26873 define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
26874 ; GCN-LABEL: v_fcmp_oge_bf16:
26876 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26877 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26878 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26879 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26880 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26881 ; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26882 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26883 ; GCN-NEXT: s_setpc_b64 s[30:31]
26885 ; GFX7-LABEL: v_fcmp_oge_bf16:
26887 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26888 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26889 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26890 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26891 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26892 ; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26893 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26894 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26896 ; GFX8-LABEL: v_fcmp_oge_bf16:
26898 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26899 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26900 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26901 ; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26902 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26903 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26905 ; GFX9-LABEL: v_fcmp_oge_bf16:
26907 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26908 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26909 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26910 ; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
26911 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26912 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26914 ; GFX10-LABEL: v_fcmp_oge_bf16:
26916 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26917 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26918 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26919 ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
26920 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26921 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26923 ; GFX11-LABEL: v_fcmp_oge_bf16:
26925 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26926 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26927 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26929 ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
26930 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26931 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26932 %op = fcmp oge bfloat %a, %b
; Codegen test for `fcmp olt` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_lt_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
26936 define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
26937 ; GCN-LABEL: v_fcmp_olt_bf16:
26939 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26940 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26941 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
26942 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26943 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26944 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26945 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26946 ; GCN-NEXT: s_setpc_b64 s[30:31]
26948 ; GFX7-LABEL: v_fcmp_olt_bf16:
26950 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26951 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26952 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
26953 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
26954 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
26955 ; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26956 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26957 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26959 ; GFX8-LABEL: v_fcmp_olt_bf16:
26961 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26962 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26963 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26964 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26965 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26966 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26968 ; GFX9-LABEL: v_fcmp_olt_bf16:
26970 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26971 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26972 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26973 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
26974 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
26975 ; GFX9-NEXT: s_setpc_b64 s[30:31]
26977 ; GFX10-LABEL: v_fcmp_olt_bf16:
26979 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26980 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26981 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26982 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
26983 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26984 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26986 ; GFX11-LABEL: v_fcmp_olt_bf16:
26988 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26989 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
26990 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
26991 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
26992 ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
26993 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
26994 ; GFX11-NEXT: s_setpc_b64 s[30:31]
26995 %op = fcmp olt bfloat %a, %b
; Codegen test for `fcmp ole` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_le_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
26999 define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
27000 ; GCN-LABEL: v_fcmp_ole_bf16:
27002 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27003 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27004 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27005 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27006 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27007 ; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27008 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27009 ; GCN-NEXT: s_setpc_b64 s[30:31]
27011 ; GFX7-LABEL: v_fcmp_ole_bf16:
27013 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27014 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27015 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27016 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27017 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27018 ; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27019 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27020 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27022 ; GFX8-LABEL: v_fcmp_ole_bf16:
27024 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27025 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27026 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27027 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27028 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27029 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27031 ; GFX9-LABEL: v_fcmp_ole_bf16:
27033 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27034 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27035 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27036 ; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
27037 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27038 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27040 ; GFX10-LABEL: v_fcmp_ole_bf16:
27042 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27043 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27044 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27045 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
27046 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27047 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27049 ; GFX11-LABEL: v_fcmp_ole_bf16:
27051 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27052 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27053 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27054 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27055 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
27056 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27057 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27058 %op = fcmp ole bfloat %a, %b
; Codegen test for `fcmp one` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_lg_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
27062 define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
27063 ; GCN-LABEL: v_fcmp_one_bf16:
27065 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27066 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27067 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27068 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27069 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27070 ; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27071 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27072 ; GCN-NEXT: s_setpc_b64 s[30:31]
27074 ; GFX7-LABEL: v_fcmp_one_bf16:
27076 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27077 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27078 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27079 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27080 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27081 ; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27082 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27083 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27085 ; GFX8-LABEL: v_fcmp_one_bf16:
27087 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27088 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27089 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27090 ; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27091 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27092 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27094 ; GFX9-LABEL: v_fcmp_one_bf16:
27096 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27097 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27098 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27099 ; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
27100 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27101 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27103 ; GFX10-LABEL: v_fcmp_one_bf16:
27105 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27106 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27107 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27108 ; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
27109 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27110 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27112 ; GFX11-LABEL: v_fcmp_one_bf16:
27114 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27115 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27116 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27117 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27118 ; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
27119 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27120 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27121 %op = fcmp one bfloat %a, %b
; Codegen test for `fcmp uno` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_u_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
27125 define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
27126 ; GCN-LABEL: v_fcmp_uno_bf16:
27128 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27129 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27130 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27131 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27132 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27133 ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27134 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27135 ; GCN-NEXT: s_setpc_b64 s[30:31]
27137 ; GFX7-LABEL: v_fcmp_uno_bf16:
27139 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27140 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27141 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27142 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27143 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27144 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27145 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27146 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27148 ; GFX8-LABEL: v_fcmp_uno_bf16:
27150 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27151 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27152 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27153 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27154 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27155 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27157 ; GFX9-LABEL: v_fcmp_uno_bf16:
27159 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27160 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27161 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27162 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
27163 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27164 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27166 ; GFX10-LABEL: v_fcmp_uno_bf16:
27168 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27169 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27170 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27171 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
27172 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27173 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27175 ; GFX11-LABEL: v_fcmp_uno_bf16:
27177 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27178 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27179 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27180 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27181 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
27182 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27183 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27184 %op = fcmp uno bfloat %a, %b
; Codegen test for `fcmp ueq` on two bfloat arguments, returning i1.
; Every expected output compares the operands as f32 with v_cmp_nlg_f32 and
; turns the condition into 0 or 1 in v0 with v_cndmask_b32. The GFX8 and
; newer outputs prepare each operand with a 16-bit left shift; the GCN and
; GFX7 outputs use v_mul_f32 by 1.0 plus a 0xffff0000 mask instead.
27188 define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
27189 ; GCN-LABEL: v_fcmp_ueq_bf16:
27191 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27192 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27193 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27194 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27195 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27196 ; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27197 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27198 ; GCN-NEXT: s_setpc_b64 s[30:31]
27200 ; GFX7-LABEL: v_fcmp_ueq_bf16:
27202 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27203 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27204 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27205 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27206 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27207 ; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27208 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27209 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27211 ; GFX8-LABEL: v_fcmp_ueq_bf16:
27213 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27214 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27215 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27216 ; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27217 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27218 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27220 ; GFX9-LABEL: v_fcmp_ueq_bf16:
27222 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27223 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27224 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27225 ; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
27226 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27227 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27229 ; GFX10-LABEL: v_fcmp_ueq_bf16:
27231 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27232 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27233 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27234 ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27235 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27236 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27238 ; GFX11-LABEL: v_fcmp_ueq_bf16:
27240 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27241 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27242 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27243 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27244 ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
27245 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27246 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27247 %op = fcmp ueq bfloat %a, %b
; fcmp ugt on two bfloat arguments, producing an i1.
; Per the checks below, the GCN and GFX7 runs apply v_mul_f32 by 1.0 and mask
; each operand with 0xffff0000, while GFX8 and newer shift each operand left
; by 16. Every run then compares with v_cmp_nle_f32 and turns the condition
; into 0/1 in v0 via v_cndmask_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27251 define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
27252 ; GCN-LABEL: v_fcmp_ugt_bf16:
27254 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27255 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27256 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27257 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27258 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27259 ; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27260 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27261 ; GCN-NEXT: s_setpc_b64 s[30:31]
27263 ; GFX7-LABEL: v_fcmp_ugt_bf16:
27265 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27266 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27267 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27268 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27269 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27270 ; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27271 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27272 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27274 ; GFX8-LABEL: v_fcmp_ugt_bf16:
27276 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27277 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27278 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27279 ; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27280 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27281 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27283 ; GFX9-LABEL: v_fcmp_ugt_bf16:
27285 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27286 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27287 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27288 ; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
27289 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27290 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27292 ; GFX10-LABEL: v_fcmp_ugt_bf16:
27294 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27295 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27296 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27297 ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
27298 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27299 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27301 ; GFX11-LABEL: v_fcmp_ugt_bf16:
27303 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27304 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27305 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27306 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27307 ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
27308 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27309 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27310 %op = fcmp ugt bfloat %a, %b
; fcmp uge on two bfloat arguments, producing an i1.
; Per the checks below, the GCN and GFX7 runs apply v_mul_f32 by 1.0 and mask
; each operand with 0xffff0000, while GFX8 and newer shift each operand left
; by 16. Every run then compares with v_cmp_nlt_f32 and turns the condition
; into 0/1 in v0 via v_cndmask_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27314 define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
27315 ; GCN-LABEL: v_fcmp_uge_bf16:
27317 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27318 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27319 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27320 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27321 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27322 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27323 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27324 ; GCN-NEXT: s_setpc_b64 s[30:31]
27326 ; GFX7-LABEL: v_fcmp_uge_bf16:
27328 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27329 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27330 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27331 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27332 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27333 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27334 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27335 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27337 ; GFX8-LABEL: v_fcmp_uge_bf16:
27339 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27340 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27341 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27342 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27343 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27344 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27346 ; GFX9-LABEL: v_fcmp_uge_bf16:
27348 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27349 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27350 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27351 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
27352 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27353 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27355 ; GFX10-LABEL: v_fcmp_uge_bf16:
27357 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27358 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27359 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27360 ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27361 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27362 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27364 ; GFX11-LABEL: v_fcmp_uge_bf16:
27366 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27367 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27368 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27369 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27370 ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
27371 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27372 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27373 %op = fcmp uge bfloat %a, %b
; fcmp ult on two bfloat arguments, producing an i1.
; Per the checks below, the GCN and GFX7 runs apply v_mul_f32 by 1.0 and mask
; each operand with 0xffff0000, while GFX8 and newer shift each operand left
; by 16. Every run then compares with v_cmp_nge_f32 and turns the condition
; into 0/1 in v0 via v_cndmask_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27377 define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
27378 ; GCN-LABEL: v_fcmp_ult_bf16:
27380 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27381 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27382 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27383 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27384 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27385 ; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27386 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27387 ; GCN-NEXT: s_setpc_b64 s[30:31]
27389 ; GFX7-LABEL: v_fcmp_ult_bf16:
27391 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27392 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27393 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27394 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27395 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27396 ; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27397 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27398 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27400 ; GFX8-LABEL: v_fcmp_ult_bf16:
27402 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27403 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27404 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27405 ; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27406 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27407 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27409 ; GFX9-LABEL: v_fcmp_ult_bf16:
27411 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27412 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27413 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27414 ; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
27415 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27416 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27418 ; GFX10-LABEL: v_fcmp_ult_bf16:
27420 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27421 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27422 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27423 ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
27424 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27425 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27427 ; GFX11-LABEL: v_fcmp_ult_bf16:
27429 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27430 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27431 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27432 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27433 ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
27434 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27435 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27436 %op = fcmp ult bfloat %a, %b
; fcmp ule on two bfloat arguments, producing an i1.
; Per the checks below, the GCN and GFX7 runs apply v_mul_f32 by 1.0 and mask
; each operand with 0xffff0000, while GFX8 and newer shift each operand left
; by 16. Every run then compares with v_cmp_ngt_f32 and turns the condition
; into 0/1 in v0 via v_cndmask_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27440 define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
27441 ; GCN-LABEL: v_fcmp_ule_bf16:
27443 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27444 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27445 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27446 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27447 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27448 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27449 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27450 ; GCN-NEXT: s_setpc_b64 s[30:31]
27452 ; GFX7-LABEL: v_fcmp_ule_bf16:
27454 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27455 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27456 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27457 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27458 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27459 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27460 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27461 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27463 ; GFX8-LABEL: v_fcmp_ule_bf16:
27465 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27466 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27467 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27468 ; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27469 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27470 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27472 ; GFX9-LABEL: v_fcmp_ule_bf16:
27474 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27475 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27476 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27477 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
27478 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27479 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27481 ; GFX10-LABEL: v_fcmp_ule_bf16:
27483 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27484 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27485 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27486 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27487 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27488 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27490 ; GFX11-LABEL: v_fcmp_ule_bf16:
27492 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27493 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27494 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27495 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27496 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
27497 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27498 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27499 %op = fcmp ule bfloat %a, %b
; fcmp une on two bfloat arguments, producing an i1.
; Per the checks below, the GCN and GFX7 runs apply v_mul_f32 by 1.0 and mask
; each operand with 0xffff0000, while GFX8 and newer shift each operand left
; by 16. Every run then compares with v_cmp_neq_f32 and turns the condition
; into 0/1 in v0 via v_cndmask_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27503 define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
27504 ; GCN-LABEL: v_fcmp_une_bf16:
27506 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27507 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27508 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
27509 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27510 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27511 ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27512 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27513 ; GCN-NEXT: s_setpc_b64 s[30:31]
27515 ; GFX7-LABEL: v_fcmp_une_bf16:
27517 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27518 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27519 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
27520 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
27521 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
27522 ; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27523 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27524 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27526 ; GFX8-LABEL: v_fcmp_une_bf16:
27528 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27529 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27530 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27531 ; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27532 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27533 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27535 ; GFX9-LABEL: v_fcmp_une_bf16:
27537 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27538 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27539 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27540 ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
27541 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
27542 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27544 ; GFX10-LABEL: v_fcmp_une_bf16:
27546 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27547 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27548 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27549 ; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
27550 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27551 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27553 ; GFX11-LABEL: v_fcmp_une_bf16:
27555 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27556 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
27557 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27558 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27559 ; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
27560 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
27561 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27562 %op = fcmp une bfloat %a, %b
; fcmp true on two bfloat arguments, producing an i1.
; Per the checks below, no compare instruction is expected on any run; each
; one simply moves the constant 1 into v0 before returning.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27566 define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
27567 ; GCN-LABEL: v_fcmp_true_bf16:
27569 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27570 ; GCN-NEXT: v_mov_b32_e32 v0, 1
27571 ; GCN-NEXT: s_setpc_b64 s[30:31]
27573 ; GFX7-LABEL: v_fcmp_true_bf16:
27575 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27576 ; GFX7-NEXT: v_mov_b32_e32 v0, 1
27577 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27579 ; GFX8-LABEL: v_fcmp_true_bf16:
27581 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27582 ; GFX8-NEXT: v_mov_b32_e32 v0, 1
27583 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27585 ; GFX9-LABEL: v_fcmp_true_bf16:
27587 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27588 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
27589 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27591 ; GFX10-LABEL: v_fcmp_true_bf16:
27593 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27594 ; GFX10-NEXT: v_mov_b32_e32 v0, 1
27595 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27597 ; GFX11-LABEL: v_fcmp_true_bf16:
27599 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27600 ; GFX11-NEXT: v_mov_b32_e32 v0, 1
27601 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27602 %op = fcmp true bfloat %a, %b
27606 declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
; llvm.copysign.bf16 with both magnitude and sign passed as bfloat values.
; Per the checks below, the GCN and GFX7 runs isolate the sign with a
; 0x80000000 mask shifted right by 16, extract 15 bits of the magnitude with
; v_bfe_u32, OR the two together and shift the result left by 16.
; GFX8 and newer use a single v_bfi_b32 with the 0x7fff mask (loaded through
; s_movk_i32 on GFX8/GFX9, used as a literal on GFX10/GFX11).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27608 define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
27609 ; GCN-LABEL: v_copysign_bf16_bf16:
27611 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27612 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27613 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27614 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27615 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27616 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27617 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27618 ; GCN-NEXT: s_setpc_b64 s[30:31]
27620 ; GFX7-LABEL: v_copysign_bf16_bf16:
27622 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27623 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27624 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27625 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27626 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27627 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27628 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27629 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27631 ; GFX8-LABEL: v_copysign_bf16_bf16:
27633 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27634 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27635 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27636 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27638 ; GFX9-LABEL: v_copysign_bf16_bf16:
27640 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27641 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27642 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27643 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27645 ; GFX10-LABEL: v_copysign_bf16_bf16:
27647 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27648 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27649 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27651 ; GFX11-LABEL: v_copysign_bf16_bf16:
27653 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27654 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27655 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27656 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; llvm.copysign.bf16 where the sign operand is marked inreg and the magnitude
; is an ordinary bfloat argument.
; Per the checks below, the sign is read from s16 on the GCN through GFX10
; runs and from s0 on GFX11. GCN and GFX7 mask and shift it with scalar
; instructions before OR-ing it into the magnitude bits; GFX8/GFX9 copy it to
; a VGPR ahead of v_bfi_b32; GFX10/GFX11 pass the SGPR to v_bfi_b32 directly.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27660 define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
27661 ; GCN-LABEL: v_copysign_bf16_s_bf16:
27663 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27664 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27665 ; GCN-NEXT: s_and_b32 s4, s16, 0x80000000
27666 ; GCN-NEXT: s_lshr_b32 s4, s4, 16
27667 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27668 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0
27669 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27670 ; GCN-NEXT: s_setpc_b64 s[30:31]
27672 ; GFX7-LABEL: v_copysign_bf16_s_bf16:
27674 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27675 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27676 ; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000
27677 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16
27678 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27679 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0
27680 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27681 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27683 ; GFX8-LABEL: v_copysign_bf16_s_bf16:
27685 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27686 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27687 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
27688 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27689 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27691 ; GFX9-LABEL: v_copysign_bf16_s_bf16:
27693 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27694 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27695 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
27696 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27697 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27699 ; GFX10-LABEL: v_copysign_bf16_s_bf16:
27701 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27702 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16
27703 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27705 ; GFX11-LABEL: v_copysign_bf16_s_bf16:
27707 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27708 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
27709 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27710 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; llvm.copysign.bf16 where the magnitude operand is marked inreg and the sign
; is an ordinary bfloat argument.
; Per the checks below, the magnitude is read from s16 on the GCN through
; GFX10 runs and from s0 on GFX11. GCN and GFX7 feed it through v_mul_f32_e64
; by 1.0 and v_bfe_u32; GFX8/GFX9 copy it to a VGPR ahead of v_bfi_b32;
; GFX10/GFX11 pass the SGPR to v_bfi_b32 directly.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27714 define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
27715 ; GCN-LABEL: v_copysign_s_bf16_bf16:
27717 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27718 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16
27719 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
27720 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
27721 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
27722 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
27723 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27724 ; GCN-NEXT: s_setpc_b64 s[30:31]
27726 ; GFX7-LABEL: v_copysign_s_bf16_bf16:
27728 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27729 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16
27730 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
27731 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
27732 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
27733 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
27734 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27735 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27737 ; GFX8-LABEL: v_copysign_s_bf16_bf16:
27739 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27740 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27741 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
27742 ; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
27743 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27745 ; GFX9-LABEL: v_copysign_s_bf16_bf16:
27747 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27748 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27749 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
27750 ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
27751 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27753 ; GFX10-LABEL: v_copysign_s_bf16_bf16:
27755 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27756 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0
27757 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27759 ; GFX11-LABEL: v_copysign_s_bf16_bf16:
27761 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27762 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27763 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27764 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; llvm.copysign.bf16 whose sign operand is a float truncated to bfloat with
; fptrunc.
; Per the checks below, the GCN and GFX7 runs mask v1 with 0x80000000 and
; shift it right by 16 before OR-ing it into the magnitude bits. GFX8 and
; newer shift v1 right by 16 and feed it to v_bfi_b32 with the 0x7fff mask.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27768 define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
27769 ; GCN-LABEL: v_copysign_bf16_f32:
27771 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27772 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27773 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27774 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27775 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27776 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27777 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27778 ; GCN-NEXT: s_setpc_b64 s[30:31]
27780 ; GFX7-LABEL: v_copysign_bf16_f32:
27782 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27783 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27784 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
27785 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27786 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27787 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27788 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27789 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27791 ; GFX8-LABEL: v_copysign_bf16_f32:
27793 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27794 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27795 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27796 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27797 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27799 ; GFX9-LABEL: v_copysign_bf16_f32:
27801 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27802 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27803 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27804 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27805 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27807 ; GFX10-LABEL: v_copysign_bf16_f32:
27809 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27810 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27811 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27812 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27814 ; GFX11-LABEL: v_copysign_bf16_f32:
27816 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27817 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27818 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27819 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27820 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27821 %sign = fptrunc float %sign.f32 to bfloat
27822 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; llvm.copysign.bf16 whose sign operand is a double truncated to bfloat with
; fptrunc.
; Per the checks below, every run reads the sign from v2 only (presumably the
; high dword of the double argument; confirm against the calling convention).
; GCN and GFX7 mask it with 0x80000000 and shift right by 16; GFX8 and newer
; shift v2 right by 16 and feed it to v_bfi_b32 with the 0x7fff mask.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27826 define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
27827 ; GCN-LABEL: v_copysign_bf16_f64:
27829 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27830 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27831 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2
27832 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27833 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27834 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27835 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27836 ; GCN-NEXT: s_setpc_b64 s[30:31]
27838 ; GFX7-LABEL: v_copysign_bf16_f64:
27840 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27841 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27842 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2
27843 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
27844 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27845 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27846 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27847 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27849 ; GFX8-LABEL: v_copysign_bf16_f64:
27851 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27852 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27853 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27854 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27855 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27857 ; GFX9-LABEL: v_copysign_bf16_f64:
27859 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27860 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27861 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27862 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27863 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27865 ; GFX10-LABEL: v_copysign_bf16_f64:
27867 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27868 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27869 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27870 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27872 ; GFX11-LABEL: v_copysign_bf16_f64:
27874 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27875 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
27876 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27877 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27878 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27879 %sign = fptrunc double %sign.f64 to bfloat
27880 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; llvm.copysign.bf16 whose sign operand is a half reinterpreted as bfloat.
; Unlike the f32/f64 variants in this file, the IR uses bitcast, not fptrunc.
; Per the checks below, the GCN and GFX7 runs convert v1 with v_cvt_f16_f32
; and mask it with 0x8000 before OR-ing it into the magnitude bits (the two
; runs differ only in instruction order). GFX8 and newer pass v1 straight to
; v_bfi_b32 with the 0x7fff mask.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27884 define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
27885 ; GCN-LABEL: v_copysign_bf16_f16:
27887 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27888 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
27889 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
27890 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
27891 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27892 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
27893 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27894 ; GCN-NEXT: s_setpc_b64 s[30:31]
27896 ; GFX7-LABEL: v_copysign_bf16_f16:
27898 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27899 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
27900 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
27901 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27902 ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1
27903 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
27904 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27905 ; GFX7-NEXT: s_setpc_b64 s[30:31]
27907 ; GFX8-LABEL: v_copysign_bf16_f16:
27909 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27910 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
27911 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
27912 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27914 ; GFX9-LABEL: v_copysign_bf16_f16:
27916 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27917 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
27918 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
27919 ; GFX9-NEXT: s_setpc_b64 s[30:31]
27921 ; GFX10-LABEL: v_copysign_bf16_f16:
27923 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27924 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27925 ; GFX10-NEXT: s_setpc_b64 s[30:31]
27927 ; GFX11-LABEL: v_copysign_bf16_f16:
27929 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27930 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
27931 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27932 %sign = bitcast half %sign.f16 to bfloat
27933 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
; amdgpu_ps variant of llvm.copysign.bf16 with both operands marked inreg
; (read from s0 and s1 in the checks). The IR bitcasts the bfloat result to
; i16, zero-extends it to i32 and passes it through llvm.amdgcn.readfirstlane.
; Per the checks below, the copysign itself still runs on the VALU (OR-based
; sequence on GCN/GFX7, v_bfi_b32 on GFX8 and newer); GFX8 and newer also mask
; the result with 0xffff, and every run ends with v_readfirstlane_b32 into s0.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
27937 define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
27938 ; GCN-LABEL: s_copysign_bf16_bf16:
27940 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
27941 ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
27942 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
27943 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
27944 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
27945 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
27946 ; GCN-NEXT: ; return to shader part epilog
27948 ; GFX7-LABEL: s_copysign_bf16_bf16:
27950 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
27951 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
27952 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
27953 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
27954 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
27955 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
27956 ; GFX7-NEXT: ; return to shader part epilog
27958 ; GFX8-LABEL: s_copysign_bf16_bf16:
27960 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
27961 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
27962 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
27963 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
27964 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
27965 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
27966 ; GFX8-NEXT: ; return to shader part epilog
27968 ; GFX9-LABEL: s_copysign_bf16_bf16:
27970 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
27971 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
27972 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
27973 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
27974 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
27975 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
27976 ; GFX9-NEXT: ; return to shader part epilog
27978 ; GFX10-LABEL: s_copysign_bf16_bf16:
27980 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
27981 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27982 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
27983 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
27984 ; GFX10-NEXT: ; return to shader part epilog
27986 ; GFX11-LABEL: s_copysign_bf16_bf16:
27988 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
27989 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
27990 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
27991 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
27992 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
27993 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
27994 ; GFX11-NEXT: ; return to shader part epilog
27995 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
27996 %cast = bitcast bfloat %op to i16
27997 %zext = zext i16 %cast to i32
27998 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; amdgpu_ps variant of llvm.copysign.bf16 with an inreg bfloat magnitude (s0)
; and an inreg float sign (s1) that the IR truncates to bfloat with fptrunc.
; The result is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane.
; Per the checks below, GCN and GFX7 mask s1 with 0x80000000 and shift it
; right by 16 on the SALU; GFX8 and newer shift s1 right by 16 into a VGPR,
; use v_bfi_b32 with the 0x7fff mask, and mask the result with 0xffff.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
28002 define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
28003 ; GCN-LABEL: s_copysign_bf16_f32:
28005 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28006 ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
28007 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
28008 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28009 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
28010 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28011 ; GCN-NEXT: ; return to shader part epilog
28013 ; GFX7-LABEL: s_copysign_bf16_f32:
28015 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
28016 ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
28017 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
28018 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
28019 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
28020 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28021 ; GFX7-NEXT: ; return to shader part epilog
28023 ; GFX8-LABEL: s_copysign_bf16_f32:
28025 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28026 ; GFX8-NEXT: s_movk_i32 s1, 0x7fff
28027 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28028 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28029 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28030 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28031 ; GFX8-NEXT: ; return to shader part epilog
28033 ; GFX9-LABEL: s_copysign_bf16_f32:
28035 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28036 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff
28037 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28038 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28039 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28040 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28041 ; GFX9-NEXT: ; return to shader part epilog
28043 ; GFX10-LABEL: s_copysign_bf16_f32:
28045 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28046 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28047 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28048 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28049 ; GFX10-NEXT: ; return to shader part epilog
28051 ; GFX11-LABEL: s_copysign_bf16_f32:
28053 ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1
28054 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28055 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28056 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28057 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28058 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28059 ; GFX11-NEXT: ; return to shader part epilog
28060 %sign = fptrunc float %sign.f32 to bfloat
28061 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28062 %cast = bitcast bfloat %op to i16
28063 %zext = zext i16 %cast to i32
28064 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; amdgpu_ps variant of llvm.copysign.bf16 with an inreg bfloat magnitude (s0)
; and an inreg double sign that the IR truncates to bfloat with fptrunc.
; The result is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane.
; Per the checks below, every run reads the sign from s2 only (presumably the
; high dword of the double argument; confirm against the calling convention).
; GCN and GFX7 mask and shift it on the SALU; GFX8 and newer shift it right by
; 16 into a VGPR, use v_bfi_b32 with 0x7fff, and mask the result with 0xffff.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
28068 define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
28069 ; GCN-LABEL: s_copysign_bf16_f64:
28071 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28072 ; GCN-NEXT: s_and_b32 s0, s2, 0x80000000
28073 ; GCN-NEXT: s_lshr_b32 s0, s0, 16
28074 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28075 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0
28076 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28077 ; GCN-NEXT: ; return to shader part epilog
28079 ; GFX7-LABEL: s_copysign_bf16_f64:
28081 ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
28082 ; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000
28083 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16
28084 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
28085 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
28086 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28087 ; GFX7-NEXT: ; return to shader part epilog
28089 ; GFX8-LABEL: s_copysign_bf16_f64:
28091 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28092 ; GFX8-NEXT: s_movk_i32 s1, 0x7fff
28093 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28094 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28095 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28096 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28097 ; GFX8-NEXT: ; return to shader part epilog
28099 ; GFX9-LABEL: s_copysign_bf16_f64:
28101 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28102 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff
28103 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28104 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28105 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28106 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28107 ; GFX9-NEXT: ; return to shader part epilog
28109 ; GFX10-LABEL: s_copysign_bf16_f64:
28111 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28112 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28113 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28114 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28115 ; GFX10-NEXT: ; return to shader part epilog
28117 ; GFX11-LABEL: s_copysign_bf16_f64:
28119 ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2
28120 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28121 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28122 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28123 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28124 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28125 ; GFX11-NEXT: ; return to shader part epilog
28126 %sign = fptrunc double %sign.f64 to bfloat
28127 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28128 %cast = bitcast bfloat %op to i16
28129 %zext = zext i16 %cast to i32
28130 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; Shader-convention (amdgpu_ps, inreg operands) bfloat copysign where the sign
; operand is a half whose bits are reinterpreted as bfloat with bitcast (no
; value conversion). The bfloat result is bitcast to i16, zero-extended to i32
; and fed to llvm.amdgcn.readfirstlane.
28134 define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
28135 ; GCN-LABEL: s_copysign_bf16_f16:
28137 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
28138 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1
28139 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
28140 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
28141 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28142 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28143 ; GCN-NEXT: ; return to shader part epilog
28145 ; GFX7-LABEL: s_copysign_bf16_f16:
28147 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1
28148 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
28149 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
28150 ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
28151 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
28152 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28153 ; GFX7-NEXT: ; return to shader part epilog
28155 ; GFX8-LABEL: s_copysign_bf16_f16:
28157 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
28158 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
28159 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28160 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
28161 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28162 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28163 ; GFX8-NEXT: ; return to shader part epilog
28165 ; GFX9-LABEL: s_copysign_bf16_f16:
28167 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
28168 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
28169 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28170 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
28171 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28172 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28173 ; GFX9-NEXT: ; return to shader part epilog
28175 ; GFX10-LABEL: s_copysign_bf16_f16:
28177 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
28178 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28179 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28180 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28181 ; GFX10-NEXT: ; return to shader part epilog
28183 ; GFX11-LABEL: s_copysign_bf16_f16:
28185 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
28186 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28187 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28188 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28189 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28190 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28191 ; GFX11-NEXT: ; return to shader part epilog
28192 %sign = bitcast half %sign.f16 to bfloat
28193 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
28194 %cast = bitcast bfloat %op to i16
28195 %zext = zext i16 %cast to i32
28196 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28200 declare float @llvm.copysign.f32(float, float)
; Float copysign whose sign operand is a bfloat widened to float with fpext
; before the llvm.copysign.f32 call.
28202 define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
28203 ; GCN-LABEL: v_copysign_f32_bf16:
28205 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28206 ; GCN-NEXT: s_brev_b32 s4, -2
28207 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
28208 ; GCN-NEXT: s_setpc_b64 s[30:31]
28210 ; GFX7-LABEL: v_copysign_f32_bf16:
28212 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28213 ; GFX7-NEXT: s_brev_b32 s4, -2
28214 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
28215 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28217 ; GFX8-LABEL: v_copysign_f32_bf16:
28219 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28220 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28221 ; GFX8-NEXT: s_brev_b32 s4, -2
28222 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
28223 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28225 ; GFX9-LABEL: v_copysign_f32_bf16:
28227 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28228 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28229 ; GFX9-NEXT: s_brev_b32 s4, -2
28230 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
28231 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28233 ; GFX10-LABEL: v_copysign_f32_bf16:
28235 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28236 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28237 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
28238 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28240 ; GFX11-LABEL: v_copysign_f32_bf16:
28242 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28243 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28244 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28245 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
28246 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28247 %sign = fpext bfloat %sign.bf16 to float
28248 %op = call float @llvm.copysign.f32(float %mag, float %sign)
; Shader-convention (amdgpu_ps, inreg operands) float copysign whose sign
; operand is a bfloat widened with fpext. The float result is bitcast to i32
; and fed to llvm.amdgcn.readfirstlane.
28252 define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
28253 ; GCN-LABEL: s_copysign_f32_bf16:
28255 ; GCN-NEXT: s_brev_b32 s2, -2
28256 ; GCN-NEXT: v_mov_b32_e32 v0, s0
28257 ; GCN-NEXT: v_mov_b32_e32 v1, s1
28258 ; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1
28259 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28260 ; GCN-NEXT: ; return to shader part epilog
28262 ; GFX7-LABEL: s_copysign_f32_bf16:
28264 ; GFX7-NEXT: s_brev_b32 s2, -2
28265 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
28266 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
28267 ; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1
28268 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28269 ; GFX7-NEXT: ; return to shader part epilog
28271 ; GFX8-LABEL: s_copysign_f32_bf16:
28273 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28274 ; GFX8-NEXT: s_brev_b32 s1, -2
28275 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
28276 ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
28277 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28278 ; GFX8-NEXT: ; return to shader part epilog
28280 ; GFX9-LABEL: s_copysign_f32_bf16:
28282 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28283 ; GFX9-NEXT: s_brev_b32 s1, -2
28284 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28285 ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
28286 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28287 ; GFX9-NEXT: ; return to shader part epilog
28289 ; GFX10-LABEL: s_copysign_f32_bf16:
28291 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28292 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
28293 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28294 ; GFX10-NEXT: ; return to shader part epilog
28296 ; GFX11-LABEL: s_copysign_f32_bf16:
28298 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1
28299 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28300 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
28301 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28302 ; GFX11-NEXT: ; return to shader part epilog
28303 %sign = fpext bfloat %sign.bf16 to float
28304 %op = call float @llvm.copysign.f32(float %mag, float %sign)
28305 %cast = bitcast float %op to i32
28306 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
28310 declare half @llvm.copysign.f16(half, half)
; Half copysign whose sign operand is a bfloat reinterpreted as half with
; bitcast (same 16 bits, no value conversion) before the llvm.copysign.f16 call.
28312 define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
28313 ; GCN-LABEL: v_copysign_f16_bf16:
28315 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28316 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28317 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
28318 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28319 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
28320 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
28321 ; GCN-NEXT: s_brev_b32 s4, -2
28322 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
28323 ; GCN-NEXT: s_setpc_b64 s[30:31]
28325 ; GFX7-LABEL: v_copysign_f16_bf16:
28327 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28328 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
28329 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28330 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28331 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
28332 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
28333 ; GFX7-NEXT: s_brev_b32 s4, -2
28334 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
28335 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28337 ; GFX8-LABEL: v_copysign_f16_bf16:
28339 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28340 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
28341 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
28342 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28344 ; GFX9-LABEL: v_copysign_f16_bf16:
28346 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28347 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
28348 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
28349 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28351 ; GFX10-LABEL: v_copysign_f16_bf16:
28353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28354 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
28355 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28357 ; GFX11-LABEL: v_copysign_f16_bf16:
28359 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28360 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
28361 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28362 %sign = bitcast bfloat %sign.bf16 to half
28363 %op = call half @llvm.copysign.f16(half %mag, half %sign)
; Shader-convention (amdgpu_ps, inreg operands) half copysign whose sign
; operand is a bfloat reinterpreted as half with bitcast. The half result is
; bitcast to i16, zero-extended to i32 and fed to llvm.amdgcn.readfirstlane.
28367 define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
28368 ; GCN-LABEL: s_copysign_f16_bf16:
28370 ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
28371 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0
28372 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
28373 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
28374 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
28375 ; GCN-NEXT: s_brev_b32 s0, -2
28376 ; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0
28377 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
28378 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
28379 ; GCN-NEXT: ; return to shader part epilog
28381 ; GFX7-LABEL: s_copysign_f16_bf16:
28383 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0
28384 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
28385 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
28386 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
28387 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
28388 ; GFX7-NEXT: s_brev_b32 s0, -2
28389 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
28390 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
28391 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
28392 ; GFX7-NEXT: ; return to shader part epilog
28394 ; GFX8-LABEL: s_copysign_f16_bf16:
28396 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff
28397 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
28398 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28399 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
28400 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
28401 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
28402 ; GFX8-NEXT: ; return to shader part epilog
28404 ; GFX9-LABEL: s_copysign_f16_bf16:
28406 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
28407 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
28408 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28409 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
28410 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
28411 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
28412 ; GFX9-NEXT: ; return to shader part epilog
28414 ; GFX10-LABEL: s_copysign_f16_bf16:
28416 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
28417 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28418 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
28419 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
28420 ; GFX10-NEXT: ; return to shader part epilog
28422 ; GFX11-LABEL: s_copysign_f16_bf16:
28424 ; GFX11-NEXT: v_mov_b32_e32 v0, s1
28425 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28426 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
28427 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
28428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28429 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
28430 ; GFX11-NEXT: ; return to shader part epilog
28431 %sign = bitcast bfloat %sign.bf16 to half
28432 %op = call half @llvm.copysign.f16(half %mag, half %sign)
28433 %cast = bitcast half %op to i16
28434 %zext = zext i16 %cast to i32
28435 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
28439 declare double @llvm.copysign.f64(double, double)
; Double copysign whose sign operand is a bfloat widened to double with fpext
; before the llvm.copysign.f64 call.
28441 define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
28442 ; GCN-LABEL: v_copysign_f64_bf16:
28444 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28445 ; GCN-NEXT: s_brev_b32 s4, -2
28446 ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2
28447 ; GCN-NEXT: s_setpc_b64 s[30:31]
28449 ; GFX7-LABEL: v_copysign_f64_bf16:
28451 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28452 ; GFX7-NEXT: s_brev_b32 s4, -2
28453 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
28454 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28456 ; GFX8-LABEL: v_copysign_f64_bf16:
28458 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28459 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28460 ; GFX8-NEXT: s_brev_b32 s4, -2
28461 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
28462 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28464 ; GFX9-LABEL: v_copysign_f64_bf16:
28466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28467 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28468 ; GFX9-NEXT: s_brev_b32 s4, -2
28469 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
28470 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28472 ; GFX10-LABEL: v_copysign_f64_bf16:
28474 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28475 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28476 ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
28477 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28479 ; GFX11-LABEL: v_copysign_f64_bf16:
28481 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28482 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
28483 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28484 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
28485 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28486 %sign = fpext bfloat %sign.bf16 to double
28487 %op = call double @llvm.copysign.f64(double %mag, double %sign)
; Shader-convention (amdgpu_ps, inreg operands) double copysign whose sign
; operand is a bfloat widened with fpext. The double result is bitcast to
; <2 x i32>; each element goes through llvm.amdgcn.readfirstlane separately and
; the two values are reassembled into the returned <2 x i32>.
28491 define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
28492 ; GCN-LABEL: s_copysign_f64_bf16:
28494 ; GCN-NEXT: s_brev_b32 s3, -2
28495 ; GCN-NEXT: v_mov_b32_e32 v0, s1
28496 ; GCN-NEXT: v_mov_b32_e32 v1, s2
28497 ; GCN-NEXT: v_bfi_b32 v0, s3, v0, v1
28498 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
28499 ; GCN-NEXT: ; return to shader part epilog
28501 ; GFX7-LABEL: s_copysign_f64_bf16:
28503 ; GFX7-NEXT: s_brev_b32 s3, -2
28504 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
28505 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
28506 ; GFX7-NEXT: v_bfi_b32 v0, s3, v0, v1
28507 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
28508 ; GFX7-NEXT: ; return to shader part epilog
28510 ; GFX8-LABEL: s_copysign_f64_bf16:
28512 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28513 ; GFX8-NEXT: s_brev_b32 s2, -2
28514 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
28515 ; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0
28516 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0
28517 ; GFX8-NEXT: ; return to shader part epilog
28519 ; GFX9-LABEL: s_copysign_f64_bf16:
28521 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28522 ; GFX9-NEXT: s_brev_b32 s2, -2
28523 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
28524 ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
28525 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
28526 ; GFX9-NEXT: ; return to shader part epilog
28528 ; GFX10-LABEL: s_copysign_f64_bf16:
28530 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28531 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
28532 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0
28533 ; GFX10-NEXT: ; return to shader part epilog
28535 ; GFX11-LABEL: s_copysign_f64_bf16:
28537 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2
28538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28539 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
28540 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
28541 ; GFX11-NEXT: ; return to shader part epilog
28542 %sign = fpext bfloat %sign.bf16 to double
28543 %op = call double @llvm.copysign.f64(double %mag, double %sign)
28544 %cast = bitcast double %op to <2 x i32>
28545 %cast.0 = extractelement <2 x i32> %cast, i32 0
28546 %cast.1 = extractelement <2 x i32> %cast, i32 1
28547 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
28548 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
28549 %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
28550 %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
28551 ret <2 x i32> %ins.1
; Signed float-to-integer conversion (fptosi) of a scalar bfloat to i16.
28554 define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
28555 ; GCN-LABEL: v_fptosi_bf16_to_i16:
28557 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28558 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28559 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28560 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28561 ; GCN-NEXT: s_setpc_b64 s[30:31]
28563 ; GFX7-LABEL: v_fptosi_bf16_to_i16:
28565 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28566 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28567 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28568 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28569 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28571 ; GFX8-LABEL: v_fptosi_bf16_to_i16:
28573 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28574 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28575 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28576 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28578 ; GFX9-LABEL: v_fptosi_bf16_to_i16:
28580 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28581 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28582 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28583 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28585 ; GFX10-LABEL: v_fptosi_bf16_to_i16:
28587 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28588 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28589 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28590 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28592 ; GFX11-LABEL: v_fptosi_bf16_to_i16:
28594 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28595 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28596 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28597 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
28598 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28599 %op = fptosi bfloat %x to i16
; Element-wise signed float-to-integer conversion (fptosi) of <2 x bfloat> to
; <2 x i16>. The two gfx1100 configurations (real-true16 on and off) have
; separate expectations for this function.
28603 define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
28604 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
28606 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28607 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28608 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28609 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28610 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28611 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28612 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28613 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28614 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28615 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2
28616 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
28617 ; GCN-NEXT: s_setpc_b64 s[30:31]
28619 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
28621 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28622 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28623 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28624 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28625 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28626 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28627 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28628 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28629 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28630 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
28631 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
28632 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28634 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
28636 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28637 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
28638 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28639 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28640 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28641 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28642 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28644 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
28646 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28647 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28648 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28649 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28650 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28651 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28652 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
28653 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28655 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
28657 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28658 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28659 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28660 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28661 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28662 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
28663 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28665 ; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28666 ; GFX11TRUE16: ; %bb.0:
28667 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28668 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
28669 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28670 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28671 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28672 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28673 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
28674 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
28675 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28677 ; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
28678 ; GFX11FAKE16: ; %bb.0:
28679 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28680 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
28681 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28682 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28683 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28684 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28685 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
28686 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
28687 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
28688 %op = fptosi <2 x bfloat> %x to <2 x i16>
; Element-wise signed float-to-integer conversion (fptosi) of <3 x bfloat> to
; <3 x i16>, i.e. the odd-element-count vector case. The two gfx1100
; configurations (real-true16 on and off) have separate expectations.
28692 define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
28693 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
28695 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28696 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
28697 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28698 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28699 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28700 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28701 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28702 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28703 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28704 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2
28705 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28706 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28707 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
28708 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28709 ; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16
28710 ; GCN-NEXT: s_setpc_b64 s[30:31]
28712 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16:
28714 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28715 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28716 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28717 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
28718 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28719 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28720 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28721 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28722 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28723 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2
28724 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28725 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28726 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
28727 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3
28728 ; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
28729 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28731 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
28733 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28734 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28735 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28736 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
28737 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28738 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28739 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
28740 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28741 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28743 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
28745 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28746 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28747 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28748 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
28749 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28750 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28751 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28752 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28753 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
28754 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28756 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
28758 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28759 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28760 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28761 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28762 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
28763 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28764 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28765 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
28766 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28768 ; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28769 ; GFX11TRUE16: ; %bb.0:
28770 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28771 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
28772 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28773 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28774 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28775 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28776 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28777 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28778 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28779 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
28780 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28782 ; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
28783 ; GFX11FAKE16: ; %bb.0:
28784 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28785 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
28786 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28787 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28788 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28789 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28790 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28791 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28792 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28793 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
28794 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
28795 %op = fptosi <3 x bfloat> %x to <3 x i16>
; Element-wise signed float-to-integer conversion (fptosi) of <4 x bfloat> to
; <4 x i16>. The two gfx1100 configurations (real-true16 on and off) have
; separate expectations for this function.
28799 define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
28800 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
28802 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28803 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
28804 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
28805 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28806 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28807 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28808 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28809 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
28810 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28811 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28812 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28813 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
28814 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
28815 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28816 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
28817 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
28818 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
28819 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
28820 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4
28821 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
28822 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
28823 ; GCN-NEXT: s_setpc_b64 s[30:31]
28825 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
28827 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28828 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
28829 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
28830 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
28831 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28832 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
28833 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
28834 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28835 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28836 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
28837 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
28838 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
28839 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28840 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
28841 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
28842 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28843 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
28844 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
28845 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
28846 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
28847 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
28848 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28850 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
28852 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28853 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28854 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28855 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28856 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28857 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
28858 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3
28859 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28860 ; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
28861 ; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28862 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
28863 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28865 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
28867 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28868 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28869 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28870 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28871 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28872 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
28873 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
28874 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28875 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
28876 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
28877 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
28878 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
28879 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28881 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
28883 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28884 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28885 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28886 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28887 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28888 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
28889 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3
28890 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28891 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
28892 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
28893 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
28894 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28896 ; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28897 ; GFX11TRUE16: ; %bb.0:
28898 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28899 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
28900 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28901 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
28902 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
28903 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28904 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28905 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28906 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28907 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
28908 ; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28909 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28910 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
28911 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
28912 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
28914 ; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
28915 ; GFX11FAKE16: ; %bb.0:
28916 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28917 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
28918 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
28919 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28920 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28921 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28922 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
28923 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
28924 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28925 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
28926 ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
28927 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
28928 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
28929 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
28930 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
28931 %op = fptosi <4 x bfloat> %x to <4 x i16>
; Test: scalar signed conversion, fptosi bfloat -> i32.
; The expected code below turns the bf16 input into an f32 bit pattern and
; issues a single v_cvt_i32_f32:
;   * GCN / GFX7 checks: v_mul_f32 by 1.0, then mask with 0xffff0000.
;   * GFX8 and newer: shift the 16-bit value left by 16.
; Both gfx1100 run lines (true16 and fake16) are covered by the shared GFX11
; checks for this function.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
28935 define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
28936 ; GCN-LABEL: v_fptosi_bf16_to_i32:
28938 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28939 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28940 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28941 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28942 ; GCN-NEXT: s_setpc_b64 s[30:31]
28944 ; GFX7-LABEL: v_fptosi_bf16_to_i32:
28946 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28947 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
28948 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28949 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
28950 ; GFX7-NEXT: s_setpc_b64 s[30:31]
28952 ; GFX8-LABEL: v_fptosi_bf16_to_i32:
28954 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28955 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28956 ; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
28957 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28959 ; GFX9-LABEL: v_fptosi_bf16_to_i32:
28961 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28962 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28963 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
28964 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28966 ; GFX10-LABEL: v_fptosi_bf16_to_i32:
28968 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28969 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28970 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
28971 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28973 ; GFX11-LABEL: v_fptosi_bf16_to_i32:
28975 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28976 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28977 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
28978 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
28979 ; GFX11-NEXT: s_setpc_b64 s[30:31]
28980 %op = fptosi bfloat %x to i32
; Test: vector signed conversion, fptosi <2 x bfloat> -> <2 x i32>.
; Expected code per element is one v_cvt_i32_f32:
;   * GCN / GFX7 checks: the two elements are read from v0 and v1, each
;     multiplied by 1.0 and masked with 0xffff0000 before the convert.
;   * GFX8 and newer: both elements are read from v0; the first is obtained
;     by shifting left by 16, the second by masking with 0xffff0000.
; Results are returned in v0 and v1.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
28984 define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
28985 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
28987 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28988 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
28989 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
28990 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
28991 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
28992 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
28993 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
28994 ; GCN-NEXT: s_setpc_b64 s[30:31]
28996 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32:
28998 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28999 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29000 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29001 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29002 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29003 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29004 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29005 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29007 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32:
29009 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29010 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29011 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v1
29012 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29013 ; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0
29014 ; GFX8-NEXT: v_mov_b32_e32 v0, v2
29015 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29017 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32:
29019 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29020 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29021 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v1
29022 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29023 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0
29024 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
29025 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29027 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
29029 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29030 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29031 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
29032 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v1
29033 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2
29034 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29036 ; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
29038 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29039 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29040 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
29041 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29042 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1
29043 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2
29044 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29045 %op = fptosi <2 x bfloat> %x to <2 x i32>
; Test: vector signed conversion, fptosi <3 x bfloat> -> <3 x i32>
; (odd element count).
; Expected code per element is one v_cvt_i32_f32:
;   * GCN / GFX7 checks: the three elements are read from v0, v1 and v2,
;     each multiplied by 1.0 and masked with 0xffff0000 before the convert.
;   * GFX8 and newer: elements 0 and 1 are extracted from v0 (shift left by
;     16, and mask with 0xffff0000); element 2 is extracted from v1 by a
;     shift left by 16 only.
; Results are returned in v0, v1 and v2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
29049 define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
29050 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
29052 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29053 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29054 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29055 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29056 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29057 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29058 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29059 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
29060 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
29061 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
29062 ; GCN-NEXT: s_setpc_b64 s[30:31]
29064 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32:
29066 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29067 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29068 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29069 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29070 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29071 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29072 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29073 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29074 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29075 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
29076 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29078 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32:
29080 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29081 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29082 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29083 ; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
29084 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
29085 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29086 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
29087 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
29088 ; GFX8-NEXT: v_mov_b32_e32 v1, v3
29089 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29091 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
29093 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29094 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29095 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29096 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
29097 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
29098 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29099 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
29100 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
29101 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
29102 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29104 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
29106 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29107 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29108 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29109 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29110 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
29111 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
29112 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
29113 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29115 ; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
29117 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29118 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29119 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29120 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29121 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29122 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
29123 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
29124 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
29125 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
29126 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29127 %op = fptosi <3 x bfloat> %x to <3 x i32>
; Test: vector signed conversion, fptosi <4 x bfloat> -> <4 x i32>.
; Expected code per element is one v_cvt_i32_f32:
;   * GCN / GFX7 checks: the four elements are read from v0..v3, each
;     multiplied by 1.0 and masked with 0xffff0000 before the convert.
;   * GFX8 and newer: two elements are extracted from each of v0 and v1
;     (shift left by 16 for one, mask with 0xffff0000 for the other).
; Results are returned in v0..v3.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
29131 define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
29132 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
29134 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29135 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
29136 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29137 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29138 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29139 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29140 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29141 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29142 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29143 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
29144 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
29145 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
29146 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
29147 ; GCN-NEXT: s_setpc_b64 s[30:31]
29149 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32:
29151 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29152 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
29153 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29154 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29155 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29156 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29157 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29158 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29159 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29160 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
29161 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
29162 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
29163 ; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
29164 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29166 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32:
29168 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29169 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29170 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29171 ; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v0
29172 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29173 ; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
29174 ; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
29175 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
29176 ; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
29177 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
29178 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
29179 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29181 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
29183 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29184 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29185 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29186 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v0
29187 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
29188 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
29189 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
29190 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
29191 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
29192 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
29193 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
29194 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29196 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
29198 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29199 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29200 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29201 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29202 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
29203 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
29204 ; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
29205 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
29206 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v5
29207 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29209 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
29211 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29212 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29213 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
29214 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
29215 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
29216 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29217 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
29218 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
29219 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29220 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
29221 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5
29222 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29223 %op = fptosi <4 x bfloat> %x to <4 x i32>
; Test: scalar signed conversion, fptosi bfloat -> i64.
; None of the expected sequences uses a direct 64-bit convert instruction;
; every target expands the operation on the f32 form of the input:
;   1. t  = v_trunc_f32(x)
;   2. hi = v_floor_f32(|t| * 0x2f800000)     (0x2f800000 is 2^-32 as f32)
;   3. lo = v_fma_f32(hi, 0xcf800000, |t|)    (0xcf800000 is -2^32 as f32)
;   4. hi and lo are each converted with v_cvt_u32_f32
;   5. s  = t >> 31 (arithmetic); both halves are XORed with s and s is then
;      subtracted from the 64-bit pair with borrow, which negates the result
;      when the sign bit of t is set.
; The result is returned in v0 (low half) and v1 (high half).
; GCN / GFX7 checks form the f32 input with v_mul_f32 by 1.0 plus a
; 0xffff0000 mask; GFX8 and newer use a shift left by 16. GFX10 and GFX11
; encode the two constants as literals instead of loading them into SGPRs.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
29227 define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
29228 ; GCN-LABEL: v_fptosi_bf16_to_i64:
29230 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29231 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29232 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29233 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29234 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29235 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29236 ; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4
29237 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
29238 ; GCN-NEXT: v_floor_f32_e32 v1, v1
29239 ; GCN-NEXT: v_fma_f32 v0, v1, s5, |v0|
29240 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29241 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29242 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
29243 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
29244 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
29245 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
29246 ; GCN-NEXT: s_setpc_b64 s[30:31]
29248 ; GFX7-LABEL: v_fptosi_bf16_to_i64:
29250 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29251 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29252 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29253 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29254 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29255 ; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4
29256 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29257 ; GFX7-NEXT: s_mov_b32 s4, 0xcf800000
29258 ; GFX7-NEXT: v_fma_f32 v2, v1, s4, |v0|
29259 ; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
29260 ; GFX7-NEXT: v_cvt_u32_f32_e32 v1, v1
29261 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29262 ; GFX7-NEXT: v_xor_b32_e32 v0, v2, v3
29263 ; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
29264 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
29265 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
29266 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29268 ; GFX8-LABEL: v_fptosi_bf16_to_i64:
29270 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29271 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29272 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0
29273 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29274 ; GFX8-NEXT: v_mul_f32_e64 v1, |v0|, s4
29275 ; GFX8-NEXT: v_floor_f32_e32 v1, v1
29276 ; GFX8-NEXT: s_mov_b32 s4, 0xcf800000
29277 ; GFX8-NEXT: v_fma_f32 v2, v1, s4, |v0|
29278 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
29279 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
29280 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29281 ; GFX8-NEXT: v_xor_b32_e32 v0, v2, v3
29282 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
29283 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
29284 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
29285 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29287 ; GFX9-LABEL: v_fptosi_bf16_to_i64:
29289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29290 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29291 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0
29292 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29293 ; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4
29294 ; GFX9-NEXT: v_floor_f32_e32 v1, v1
29295 ; GFX9-NEXT: s_mov_b32 s4, 0xcf800000
29296 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
29297 ; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0|
29298 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
29299 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29300 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3
29301 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3
29302 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
29303 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
29304 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29306 ; GFX10-LABEL: v_fptosi_bf16_to_i64:
29308 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29309 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29310 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29311 ; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
29312 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29313 ; GFX10-NEXT: v_floor_f32_e32 v1, v1
29314 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
29315 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
29316 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v2
29317 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
29318 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v3
29319 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
29320 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29321 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29323 ; GFX11-LABEL: v_fptosi_bf16_to_i64:
29325 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29326 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
29327 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29328 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29329 ; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
29330 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29332 ; GFX11-NEXT: v_floor_f32_e32 v1, v1
29333 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
29334 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
29335 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29336 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2
29337 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
29338 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
29339 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3
29340 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
29341 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
29342 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
29343 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29344 %op = fptosi bfloat %x to i64
; Test: vector signed conversion, fptosi <2 x bfloat> -> <2 x i64>.
; Each element goes through the same expansion that the expected code shows
; for a single value: v_trunc_f32, multiply |t| by 0x2f800000 (2^-32 as f32)
; and v_floor_f32 to get the high word, v_fma_f32 with 0xcf800000 (-2^32 as
; f32) to get the low word, v_cvt_u32_f32 on both words, then XOR with and
; subtract (with borrow) the sign mask produced by an arithmetic shift right
; by 31.
; Element 0 is returned in v0 (low) / v1 (high) and element 1 in v2 (low) /
; v3 (high).
; GCN / GFX7 checks read the two inputs from v0 and v1 (v_mul_f32 by 1.0
; plus a 0xffff0000 mask); GFX8 and newer extract both inputs from v0
; (shift left by 16, and mask with 0xffff0000).
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
29348 define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
29349 ; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
29351 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29352 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29353 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29354 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29355 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29356 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29357 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29358 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29359 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29360 ; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4
29361 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v0
29362 ; GCN-NEXT: v_mul_f32_e64 v4, |v1|, s4
29363 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
29364 ; GCN-NEXT: v_floor_f32_e32 v2, v2
29365 ; GCN-NEXT: v_floor_f32_e32 v4, v4
29366 ; GCN-NEXT: v_fma_f32 v0, v2, s5, |v0|
29367 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29368 ; GCN-NEXT: v_fma_f32 v1, v4, s5, |v1|
29369 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
29370 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29371 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v3
29372 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29373 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
29374 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v3
29375 ; GCN-NEXT: v_xor_b32_e32 v6, v1, v5
29376 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
29377 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc
29378 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v5
29379 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
29380 ; GCN-NEXT: s_setpc_b64 s[30:31]
29382 ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64:
29384 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29385 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29386 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29387 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29388 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29389 ; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4
29390 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29391 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29392 ; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0|
29393 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29394 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29395 ; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0
29396 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29397 ; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4
29398 ; GFX7-NEXT: v_trunc_f32_e32 v3, v1
29399 ; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4
29400 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29401 ; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
29402 ; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3|
29403 ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
29404 ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1
29405 ; GFX7-NEXT: v_xor_b32_e32 v2, v2, v4
29406 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
29407 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
29408 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc
29409 ; GFX7-NEXT: v_xor_b32_e32 v2, v5, v3
29410 ; GFX7-NEXT: v_xor_b32_e32 v4, v6, v3
29411 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
29412 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29413 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29415 ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64:
29417 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29418 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29419 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29420 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29421 ; GFX8-NEXT: v_mul_f32_e64 v2, |v1|, s4
29422 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29423 ; GFX8-NEXT: v_floor_f32_e32 v2, v2
29424 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29425 ; GFX8-NEXT: v_trunc_f32_e32 v4, v0
29426 ; GFX8-NEXT: v_fma_f32 v3, v2, s5, |v1|
29427 ; GFX8-NEXT: v_mul_f32_e64 v0, |v4|, s4
29428 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29429 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29430 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
29431 ; GFX8-NEXT: v_fma_f32 v5, v0, s5, |v4|
29432 ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
29433 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29434 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
29435 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v1
29436 ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v1
29437 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v3, v1
29438 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29439 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
29440 ; GFX8-NEXT: v_xor_b32_e32 v2, v5, v3
29441 ; GFX8-NEXT: v_xor_b32_e32 v4, v6, v3
29442 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29443 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29444 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29446 ; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
29448 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29449 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29450 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
29451 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29452 ; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4
29453 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29454 ; GFX9-NEXT: v_floor_f32_e32 v2, v2
29455 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
29456 ; GFX9-NEXT: v_trunc_f32_e32 v4, v0
29457 ; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1|
29458 ; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4
29459 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
29460 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
29461 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
29462 ; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4|
29463 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
29464 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29465 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
29466 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1
29467 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
29468 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
29469 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29470 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
29471 ; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3
29472 ; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3
29473 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
29474 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29475 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29477 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
29479 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29480 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29481 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29482 ; GFX10-NEXT: v_trunc_f32_e32 v1, v1
29483 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29484 ; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
29485 ; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
29486 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29487 ; GFX10-NEXT: v_floor_f32_e32 v2, v2
29488 ; GFX10-NEXT: v_floor_f32_e32 v3, v3
29489 ; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
29490 ; GFX10-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
29491 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29492 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
29493 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
29494 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v4
29495 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v5
29496 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
29497 ; GFX10-NEXT: v_xor_b32_e32 v3, v3, v6
29498 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
29499 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v6
29500 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
29501 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29502 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
29503 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29504 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29506 ; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
29508 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29509 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
29510 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29511 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29512 ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
29513 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29514 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29515 ; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
29516 ; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
29517 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29519 ; GFX11-NEXT: v_floor_f32_e32 v2, v2
29520 ; GFX11-NEXT: v_floor_f32_e32 v3, v3
29521 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
29522 ; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
29523 ; GFX11-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
29524 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29525 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
29526 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
29527 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v4
29528 ; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v5
29529 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29530 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
29531 ; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6
29532 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29533 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
29534 ; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6
29535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29536 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
29537 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
29538 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
29539 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
29540 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29541 %op = fptosi <2 x bfloat> %x to <2 x i64>
29545 define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
29546 ; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
29548 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29549 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29550 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29551 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29552 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29553 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29554 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29555 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29556 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29557 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29558 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29559 ; GCN-NEXT: v_trunc_f32_e32 v2, v2
29560 ; GCN-NEXT: v_mul_f32_e64 v3, |v0|, s4
29561 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
29562 ; GCN-NEXT: v_mul_f32_e64 v5, |v1|, s4
29563 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
29564 ; GCN-NEXT: v_mul_f32_e64 v7, |v2|, s4
29565 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v2
29566 ; GCN-NEXT: v_floor_f32_e32 v3, v3
29567 ; GCN-NEXT: v_floor_f32_e32 v5, v5
29568 ; GCN-NEXT: v_floor_f32_e32 v7, v7
29569 ; GCN-NEXT: v_fma_f32 v0, v3, s5, |v0|
29570 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
29571 ; GCN-NEXT: v_fma_f32 v1, v5, s5, |v1|
29572 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
29573 ; GCN-NEXT: v_fma_f32 v2, v7, s5, |v2|
29574 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7
29575 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29576 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v4
29577 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29578 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v6
29579 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29580 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v8
29581 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
29582 ; GCN-NEXT: v_xor_b32_e32 v9, v1, v6
29583 ; GCN-NEXT: v_xor_b32_e32 v10, v2, v8
29584 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
29585 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
29586 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v9, v6
29587 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc
29588 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, v10, v8
29589 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, v7, v8, vcc
29590 ; GCN-NEXT: s_setpc_b64 s[30:31]
29592 ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64:
29594 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29595 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29596 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29597 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29598 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29599 ; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
29600 ; GFX7-NEXT: v_floor_f32_e32 v3, v3
29601 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29602 ; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0|
29603 ; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4
29604 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29605 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0
29606 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29607 ; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5
29608 ; GFX7-NEXT: v_trunc_f32_e32 v4, v1
29609 ; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4
29610 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29611 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29612 ; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4|
29613 ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6
29614 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29615 ; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5
29616 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
29617 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1
29618 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc
29619 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4
29620 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29621 ; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3
29622 ; GFX7-NEXT: v_trunc_f32_e32 v6, v2
29623 ; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4
29624 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29625 ; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3
29626 ; GFX7-NEXT: v_fma_f32 v7, v2, s5, |v6|
29627 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
29628 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v2
29629 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v5, v3
29630 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v6
29631 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29632 ; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5
29633 ; GFX7-NEXT: v_xor_b32_e32 v6, v8, v5
29634 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
29635 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29636 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29638 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64:
29640 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29641 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29642 ; GFX8-NEXT: v_trunc_f32_e32 v2, v2
29643 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29644 ; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
29645 ; GFX8-NEXT: v_floor_f32_e32 v3, v3
29646 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29647 ; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
29648 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29649 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
29650 ; GFX8-NEXT: v_trunc_f32_e32 v5, v0
29651 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29652 ; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
29653 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29654 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29655 ; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
29656 ; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
29657 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v6
29658 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29659 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
29660 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v0
29661 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
29662 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29663 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc
29664 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29665 ; GFX8-NEXT: v_mul_f32_e64 v5, |v1|, s4
29666 ; GFX8-NEXT: v_floor_f32_e32 v5, v5
29667 ; GFX8-NEXT: v_xor_b32_e32 v2, v7, v3
29668 ; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v1|
29669 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29670 ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
29671 ; GFX8-NEXT: v_xor_b32_e32 v4, v8, v3
29672 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29673 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29674 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29675 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v1
29676 ; GFX8-NEXT: v_xor_b32_e32 v5, v5, v1
29677 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v1
29678 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
29679 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
29680 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29682 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
29684 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29685 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29686 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
29687 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29688 ; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
29689 ; GFX9-NEXT: v_floor_f32_e32 v3, v3
29690 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
29691 ; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
29692 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29693 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
29694 ; GFX9-NEXT: v_trunc_f32_e32 v5, v0
29695 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
29696 ; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
29697 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
29698 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29699 ; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
29700 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
29701 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
29702 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29703 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
29704 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0
29705 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
29706 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
29707 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
29708 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29709 ; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4
29710 ; GFX9-NEXT: v_floor_f32_e32 v5, v5
29711 ; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3
29712 ; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1|
29713 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
29714 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
29715 ; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3
29716 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
29717 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29718 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
29719 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1
29720 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1
29721 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
29722 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
29723 ; GFX9-NEXT: v_mov_b32_e32 v1, v6
29724 ; GFX9-NEXT: s_setpc_b64 s[30:31]
29726 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
29728 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29729 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29730 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29731 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29732 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2
29733 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
29734 ; GFX10-NEXT: v_trunc_f32_e32 v1, v1
29735 ; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
29736 ; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
29737 ; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
29738 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
29739 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
29740 ; GFX10-NEXT: v_floor_f32_e32 v3, v3
29741 ; GFX10-NEXT: v_floor_f32_e32 v4, v4
29742 ; GFX10-NEXT: v_floor_f32_e32 v6, v6
29743 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v1
29744 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
29745 ; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
29746 ; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
29747 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
29748 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
29749 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
29750 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
29751 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
29752 ; GFX10-NEXT: v_xor_b32_e32 v3, v3, v5
29753 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
29754 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
29755 ; GFX10-NEXT: v_xor_b32_e32 v9, v0, v7
29756 ; GFX10-NEXT: v_xor_b32_e32 v4, v4, v7
29757 ; GFX10-NEXT: v_xor_b32_e32 v10, v1, v8
29758 ; GFX10-NEXT: v_xor_b32_e32 v6, v6, v8
29759 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
29760 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29761 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
29762 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29763 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
29764 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29765 ; GFX10-NEXT: s_setpc_b64 s[30:31]
29767 ; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
29769 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29770 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29771 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29772 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29773 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29774 ; GFX11-NEXT: v_trunc_f32_e32 v2, v2
29775 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
29776 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29777 ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
29778 ; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
29779 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29780 ; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
29781 ; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
29782 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
29783 ; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
29784 ; GFX11-NEXT: v_floor_f32_e32 v3, v3
29785 ; GFX11-NEXT: v_floor_f32_e32 v4, v4
29786 ; GFX11-NEXT: v_floor_f32_e32 v6, v6
29787 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v1
29788 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
29789 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
29790 ; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
29791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
29792 ; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
29793 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
29794 ; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4
29795 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
29796 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
29797 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
29798 ; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
29799 ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
29800 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
29801 ; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7
29802 ; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7
29803 ; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8
29804 ; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8
29805 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
29806 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
29807 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
29808 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
29809 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
29810 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
29811 ; GFX11-NEXT: s_setpc_b64 s[30:31]
29812 %op = fptosi <3 x bfloat> %x to <3 x i64>
; Converts each lane of a <4 x bfloat> argument to i64 with fptosi.
; On every check prefix the expected code has the same per-lane shape: isolate
; the bfloat bits in the upper 16 bits of a register (v_lshlrev_b32 by 16 or
; v_and_b32 with 0xffff0000), v_trunc_f32, then split |x| into two 32-bit
; halves using 0x2f800000 (2^-32 as an f32 bit pattern) and 0xcf800000
; (-2^32 as an f32 bit pattern): hi = floor(|x| * 2^-32) and
; lo = fma(hi, -2^32, |x|). Both halves go through v_cvt_u32_f32, and the sign
; mask (v_ashrrev_i32 by 31) is applied with an xor followed by a 64-bit
; subtract-with-borrow.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
29816 define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
29817 ; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
29819 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29820 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
29821 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
29822 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
29823 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
29824 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000
29825 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000
29826 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29827 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29828 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29829 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
29830 ; GCN-NEXT: v_trunc_f32_e32 v0, v0
29831 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
29832 ; GCN-NEXT: v_trunc_f32_e32 v2, v2
29833 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
29834 ; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4
29835 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0
29836 ; GCN-NEXT: v_mul_f32_e64 v6, |v1|, s4
29837 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1
29838 ; GCN-NEXT: v_mul_f32_e64 v8, |v2|, s4
29839 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v2
29840 ; GCN-NEXT: v_mul_f32_e64 v10, |v3|, s4
29841 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3
29842 ; GCN-NEXT: v_floor_f32_e32 v4, v4
29843 ; GCN-NEXT: v_floor_f32_e32 v6, v6
29844 ; GCN-NEXT: v_floor_f32_e32 v8, v8
29845 ; GCN-NEXT: v_floor_f32_e32 v10, v10
29846 ; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0|
29847 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
29848 ; GCN-NEXT: v_fma_f32 v1, v6, s5, |v1|
29849 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
29850 ; GCN-NEXT: v_fma_f32 v2, v8, s5, |v2|
29851 ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
29852 ; GCN-NEXT: v_fma_f32 v3, v10, s5, |v3|
29853 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
29854 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
29855 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
29856 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
29857 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v7
29858 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
29859 ; GCN-NEXT: v_xor_b32_e32 v8, v8, v9
29860 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
29861 ; GCN-NEXT: v_xor_b32_e32 v10, v10, v11
29862 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5
29863 ; GCN-NEXT: v_xor_b32_e32 v12, v1, v7
29864 ; GCN-NEXT: v_xor_b32_e32 v13, v2, v9
29865 ; GCN-NEXT: v_xor_b32_e32 v14, v3, v11
29866 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
29867 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc
29868 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v12, v7
29869 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v6, v7, vcc
29870 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, v13, v9
29871 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, v8, v9, vcc
29872 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, v14, v11
29873 ; GCN-NEXT: v_subb_u32_e32 v7, vcc, v10, v11, vcc
29874 ; GCN-NEXT: s_setpc_b64 s[30:31]
29876 ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64:
29878 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29879 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
29880 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29881 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0
29882 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
29883 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3
29884 ; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
29885 ; GFX7-NEXT: v_floor_f32_e32 v3, v3
29886 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
29887 ; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0|
29888 ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
29889 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
29890 ; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0
29891 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29892 ; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6
29893 ; GFX7-NEXT: v_trunc_f32_e32 v5, v1
29894 ; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4
29895 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
29896 ; GFX7-NEXT: v_floor_f32_e32 v1, v1
29897 ; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5|
29898 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
29899 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
29900 ; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6
29901 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
29902 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1
29903 ; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
29904 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29905 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
29906 ; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3
29907 ; GFX7-NEXT: v_trunc_f32_e32 v7, v2
29908 ; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4
29909 ; GFX7-NEXT: v_floor_f32_e32 v2, v2
29910 ; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3
29911 ; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7|
29912 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8
29913 ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2
29914 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3
29915 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
29916 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7
29917 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
29918 ; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5
29919 ; GFX7-NEXT: v_trunc_f32_e32 v8, v4
29920 ; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4
29921 ; GFX7-NEXT: v_floor_f32_e32 v4, v4
29922 ; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5
29923 ; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8|
29924 ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9
29925 ; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4
29926 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5
29927 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8
29928 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29929 ; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7
29930 ; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7
29931 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
29932 ; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
29933 ; GFX7-NEXT: s_setpc_b64 s[30:31]
29935 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64:
29937 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29938 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29939 ; GFX8-NEXT: v_trunc_f32_e32 v2, v2
29940 ; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
29941 ; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
29942 ; GFX8-NEXT: v_floor_f32_e32 v3, v3
29943 ; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
29944 ; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
29945 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
29946 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
29947 ; GFX8-NEXT: v_trunc_f32_e32 v5, v0
29948 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
29949 ; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
29950 ; GFX8-NEXT: v_floor_f32_e32 v0, v0
29951 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
29952 ; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
29953 ; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
29954 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
29955 ; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
29956 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v0
29957 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
29958 ; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc
29959 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
29960 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
29961 ; GFX8-NEXT: v_trunc_f32_e32 v5, v5
29962 ; GFX8-NEXT: v_xor_b32_e32 v2, v6, v3
29963 ; GFX8-NEXT: v_mul_f32_e64 v6, |v5|, s4
29964 ; GFX8-NEXT: v_floor_f32_e32 v6, v6
29965 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v3
29966 ; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v5|
29967 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29968 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
29969 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
29970 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
29971 ; GFX8-NEXT: v_trunc_f32_e32 v1, v1
29972 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
29973 ; GFX8-NEXT: v_xor_b32_e32 v4, v7, v5
29974 ; GFX8-NEXT: v_mul_f32_e64 v7, |v1|, s4
29975 ; GFX8-NEXT: v_floor_f32_e32 v7, v7
29976 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
29977 ; GFX8-NEXT: v_fma_f32 v9, v7, s5, |v1|
29978 ; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v9
29979 ; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
29980 ; GFX8-NEXT: v_xor_b32_e32 v6, v6, v5
29981 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
29982 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
29983 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
29984 ; GFX8-NEXT: v_xor_b32_e32 v6, v9, v1
29985 ; GFX8-NEXT: v_xor_b32_e32 v7, v7, v1
29986 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v1
29987 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc
29988 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
29989 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29991 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
29993 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29994 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
29995 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
29996 ; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
29997 ; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
29998 ; GFX9-NEXT: v_floor_f32_e32 v3, v3
29999 ; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
30000 ; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
30001 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30002 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
30003 ; GFX9-NEXT: v_trunc_f32_e32 v5, v0
30004 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
30005 ; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
30006 ; GFX9-NEXT: v_floor_f32_e32 v0, v0
30007 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
30008 ; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
30009 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
30010 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
30011 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
30012 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
30013 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
30014 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
30015 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
30016 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
30017 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
30018 ; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3
30019 ; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4
30020 ; GFX9-NEXT: v_floor_f32_e32 v6, v6
30021 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3
30022 ; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5|
30023 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
30024 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30025 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
30026 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
30027 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
30028 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
30029 ; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5
30030 ; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4
30031 ; GFX9-NEXT: v_floor_f32_e32 v7, v7
30032 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
30033 ; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1|
30034 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
30035 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
30036 ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5
30037 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
30038 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
30039 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
30040 ; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1
30041 ; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1
30042 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
30043 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
30044 ; GFX9-NEXT: v_mov_b32_e32 v1, v8
30045 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30047 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
30049 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30050 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
30051 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30052 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
30053 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30054 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2
30055 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
30056 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
30057 ; GFX10-NEXT: v_trunc_f32_e32 v4, v1
30058 ; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
30059 ; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
30060 ; GFX10-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
30061 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
30062 ; GFX10-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
30063 ; GFX10-NEXT: v_floor_f32_e32 v1, v1
30064 ; GFX10-NEXT: v_floor_f32_e32 v6, v6
30065 ; GFX10-NEXT: v_floor_f32_e32 v8, v8
30066 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
30067 ; GFX10-NEXT: v_floor_f32_e32 v9, v9
30068 ; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
30069 ; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
30070 ; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v3
30071 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
30072 ; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
30073 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
30074 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
30075 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
30076 ; GFX10-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
30077 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
30078 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
30079 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v3
30080 ; GFX10-NEXT: v_xor_b32_e32 v3, v0, v7
30081 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
30082 ; GFX10-NEXT: v_xor_b32_e32 v6, v6, v7
30083 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
30084 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
30085 ; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v4
30086 ; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v9
30087 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30088 ; GFX10-NEXT: v_xor_b32_e32 v4, v12, v10
30089 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
30090 ; GFX10-NEXT: v_xor_b32_e32 v5, v8, v10
30091 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30092 ; GFX10-NEXT: v_xor_b32_e32 v6, v11, v13
30093 ; GFX10-NEXT: v_xor_b32_e32 v7, v9, v13
30094 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
30095 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30096 ; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
30097 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30098 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30100 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
30102 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30103 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
30104 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30105 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
30106 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30107 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30108 ; GFX11-NEXT: v_trunc_f32_e32 v2, v2
30109 ; GFX11-NEXT: v_trunc_f32_e32 v0, v0
30110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30111 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
30112 ; GFX11-NEXT: v_trunc_f32_e32 v4, v1
30113 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30114 ; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
30115 ; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
30116 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
30117 ; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
30118 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
30119 ; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
30120 ; GFX11-NEXT: v_floor_f32_e32 v1, v1
30121 ; GFX11-NEXT: v_floor_f32_e32 v6, v6
30122 ; GFX11-NEXT: v_floor_f32_e32 v8, v8
30123 ; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
30124 ; GFX11-NEXT: v_floor_f32_e32 v9, v9
30125 ; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
30126 ; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
30127 ; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v3
30128 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
30129 ; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
30130 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
30131 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
30132 ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
30133 ; GFX11-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
30134 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
30135 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
30136 ; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v3
30137 ; GFX11-NEXT: v_xor_b32_e32 v3, v0, v7
30138 ; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8
30139 ; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7
30140 ; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11
30141 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
30142 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4
30143 ; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9
30144 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
30145 ; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
30146 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
30147 ; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10
30148 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
30149 ; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13
30150 ; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13
30151 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
30152 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
30153 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30154 ; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
30155 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
30156 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30157 %op = fptosi <4 x bfloat> %x to <4 x i64>
; Converts a single i16 argument to bfloat with sitofp.
; Every check prefix first sign-extends the low 16 bits of v0 and converts to
; f32 (v_bfe_i32 plus v_cvt_f32_i32, or one SDWA v_cvt_f32_i32 with sext and
; a WORD_0 source select).
; The GCN and GFX7 checks then only mask the f32 result with 0xffff0000.
; The GFX8 and newer checks instead add bit 16 of the result plus 0x7fff to
; the f32 bits, select the value or'ed with 0x400000 when v_cmp_u_f32 reports
; an unordered compare, and shift right by 16 -- presumably
; round-to-nearest-even with NaN quieting (confirm against the lowering code).
30161 define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
30162 ; GCN-LABEL: v_sitofp_i16_to_bf16:
30164 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30165 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30166 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30167 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30168 ; GCN-NEXT: s_setpc_b64 s[30:31]
30170 ; GFX7-LABEL: v_sitofp_i16_to_bf16:
30172 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30173 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30174 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30175 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30176 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30178 ; GFX8-LABEL: v_sitofp_i16_to_bf16:
30180 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30181 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30182 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30183 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30184 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30185 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
30186 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30187 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
30188 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30189 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30191 ; GFX9-LABEL: v_sitofp_i16_to_bf16:
30193 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30194 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30195 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30196 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
30197 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
30198 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
30199 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30200 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
30201 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30202 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30204 ; GFX10-LABEL: v_sitofp_i16_to_bf16:
30206 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30207 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30208 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
30209 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
30210 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30211 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30212 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30213 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30214 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30216 ; GFX11-LABEL: v_sitofp_i16_to_bf16:
30218 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30219 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
30220 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30221 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30222 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
30223 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
30224 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30225 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
30226 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30227 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30228 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30229 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30230 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30231 %op = sitofp i16 %x to bfloat
; Converts a <2 x i16> argument to <2 x bfloat> with sitofp.
; The GCN and GFX7 checks read the two lanes from v0 and v1, convert each with
; v_bfe_i32 plus v_cvt_f32_i32, and mask each f32 result with 0xffff0000.
; The GFX8 and newer checks read both lanes out of v0 (SDWA WORD_0 / WORD_1
; source selects, or v_bfe_i32 plus v_ashrrev_i32 by 16 on GFX11). Per lane
; they add bit 16 plus 0x7fff to the f32 bits and select the value or'ed with
; 0x400000 when v_cmp_u_f32 reports an unordered compare. The two upper
; halves are then repacked into v0 with v_alignbit_b32 (GFX8) or with
; v_perm_b32 and selector 0x7060302 (GFX9 and newer).
30235 define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
30236 ; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
30238 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30239 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30240 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30241 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30242 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30243 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30244 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30245 ; GCN-NEXT: s_setpc_b64 s[30:31]
30247 ; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
30249 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30250 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30251 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30252 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30253 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30254 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30255 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30256 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30258 ; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
30260 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30261 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30262 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30263 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30264 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30265 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30266 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30267 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30268 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30269 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30270 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30271 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30272 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
30273 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30274 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30275 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30276 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
30277 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30279 ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
30281 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30282 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30283 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30284 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30285 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
30286 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
30287 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
30288 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30289 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
30290 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
30291 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
30292 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
30293 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30294 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30295 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30296 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
30297 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30299 ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
30301 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30302 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30303 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30304 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
30305 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
30306 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
30307 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30308 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
30309 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
30310 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30311 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30312 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30313 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30314 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
30315 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30317 ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
30319 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30320 ; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16
30321 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30322 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30323 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30324 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30325 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30326 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
30327 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
30328 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
30329 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30330 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
30331 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
30332 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30333 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30334 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
30335 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30336 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
30337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30338 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
30339 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30340 %op = sitofp <2 x i16> %x to <2 x bfloat>
30341 ret <2 x bfloat> %op
; Codegen test for sitofp from <3 x i16> to <3 x bfloat>.
; The check lines below are the expected llc output for each RUN line at the
; top of the file; they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them instead of editing by hand.
;  - The GCN and GFX7 expectations sign-extend each lane (v_bfe_i32), convert it
;    with v_cvt_f32_i32 and mask the result with 0xffff0000.
;  - The GFX8 and later expectations run a v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence per lane (presumably
;    round-to-nearest-even with NaN quieting - confirm against the bf16
;    lowering) and then combine lanes with v_alignbit_b32 or v_perm_b32.
;  - The GFX11 expectations are split by true16 mode; the two variants differ
;    in the last s_delay_alu operand and in src0 of the final v_alignbit_b32
;    (v0 versus s0).
30344 define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
30345 ; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
30347 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30348 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30349 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30350 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
30351 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30352 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30353 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30354 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30355 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30356 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30357 ; GCN-NEXT: s_setpc_b64 s[30:31]
30359 ; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
30361 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30362 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30363 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30364 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
30365 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30366 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30367 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30368 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30369 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30370 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30371 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30373 ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
30375 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30376 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30377 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30378 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30379 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30380 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30381 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30382 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30383 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30384 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30385 ; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
30386 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
30387 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30388 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
30389 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
30390 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
30391 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
30392 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
30393 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30394 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
30395 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30396 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30397 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30398 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30399 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
30400 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30402 ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
30404 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30405 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30406 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30407 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30408 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30409 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30410 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30411 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
30412 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30413 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30414 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30415 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30416 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30417 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30418 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30419 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
30420 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
30421 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
30422 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30423 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30424 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30425 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
30426 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
30427 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30429 ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
30431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30432 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30433 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30434 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30435 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
30436 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
30437 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
30438 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30439 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
30440 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30441 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
30442 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30443 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
30444 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30445 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30446 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30447 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30448 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30449 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30450 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30451 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
30452 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30454 ; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30455 ; GFX11TRUE16: ; %bb.0:
30456 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30457 ; GFX11TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 16
30458 ; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
30459 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30460 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30461 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30462 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30463 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30464 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30465 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
30466 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30467 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
30468 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
30469 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
30470 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30471 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30472 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
30473 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
30474 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30475 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30476 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30477 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30478 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30479 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30480 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30481 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30482 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30483 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
30484 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
30485 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
30487 ; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
30488 ; GFX11FAKE16: ; %bb.0:
30489 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30490 ; GFX11FAKE16-NEXT: v_bfe_i32 v2, v0, 0, 16
30491 ; GFX11FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 16
30492 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30493 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30494 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30495 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30496 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30497 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30498 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
30499 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30500 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
30501 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
30502 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
30503 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30504 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
30505 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
30506 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
30507 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
30508 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
30509 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
30510 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30511 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30512 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
30513 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30514 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
30515 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
30516 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
30517 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
30518 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
30519 %op = sitofp <3 x i16> %x to <3 x bfloat>
30520 ret <3 x bfloat> %op
; Codegen test for sitofp from <4 x i16> to <4 x bfloat>.
; The check lines below are the expected llc output for each RUN line at the
; top of the file; they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them instead of editing by hand.
;  - The GCN and GFX7 expectations sign-extend each lane (v_bfe_i32), convert it
;    with v_cvt_f32_i32 and mask the result with 0xffff0000.
;  - The GFX8 and later expectations run a v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence per lane (presumably
;    round-to-nearest-even with NaN quieting - confirm against the bf16
;    lowering) and then combine lane pairs with v_alignbit_b32 (GFX8) or
;    v_perm_b32 (GFX9 and later).
;  - The GFX11 expectations use the shared GFX11 prefix, so both true16 RUN
;    lines are expected to produce the same output here.
30523 define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
30524 ; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
30526 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30527 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
30528 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
30529 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
30530 ; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16
30531 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
30532 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30533 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30534 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30535 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30536 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30537 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30538 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
30539 ; GCN-NEXT: s_setpc_b64 s[30:31]
30541 ; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
30543 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30544 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
30545 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
30546 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
30547 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
30548 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30549 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30550 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30551 ; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
30552 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30553 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30554 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30555 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
30556 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30558 ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
30560 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30561 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30562 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30563 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30564 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
30565 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
30566 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
30567 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30568 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
30569 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30570 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
30571 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30572 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30573 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
30574 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
30575 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30576 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30577 ; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1
30578 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30579 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
30580 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
30581 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5
30582 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
30583 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30584 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
30585 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
30586 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30587 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
30588 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30589 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
30590 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30591 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30592 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
30593 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
30594 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30596 ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
30598 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30599 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30600 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30601 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30602 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30603 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30604 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30605 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30606 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30607 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30608 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30609 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30610 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30611 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
30612 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30613 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
30614 ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
30615 ; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
30616 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
30617 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
30618 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
30619 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
30620 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
30621 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
30622 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30623 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
30624 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30625 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
30626 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
30627 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30629 ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
30631 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30632 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30633 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
30634 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30635 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
30636 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
30637 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
30638 ; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
30639 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30640 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
30641 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30642 ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
30643 ; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
30644 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
30645 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
30646 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30647 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
30648 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
30649 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
30650 ; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
30651 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30652 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30653 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30654 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30655 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
30656 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30657 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
30658 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30660 ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
30662 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30663 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
30664 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
30665 ; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16
30666 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
30667 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30668 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
30669 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30670 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
30671 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
30672 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30673 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
30674 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
30675 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
30676 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
30677 ; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
30678 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30679 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30680 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
30681 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
30682 ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
30683 ; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
30684 ; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
30685 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
30686 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
30687 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
30688 ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
30689 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
30690 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30691 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
30692 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
30693 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30694 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
30695 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
30696 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30697 ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
30698 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30699 %op = sitofp <4 x i16> %x to <4 x bfloat>
30700 ret <4 x bfloat> %op
; Codegen test for scalar sitofp from i32 to bfloat.
; The check lines below are the expected llc output for each RUN line at the
; top of the file; they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them instead of editing by hand.
;  - The GCN and GFX7 expectations convert with v_cvt_f32_i32 and mask the
;    result with 0xffff0000.
;  - The GFX8 and later expectations run a v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence (presumably
;    round-to-nearest-even with NaN quieting - confirm against the bf16
;    lowering) and finish with a logical right shift by 16.
;  - The GFX11 expectations use the shared GFX11 prefix, so both true16 RUN
;    lines are expected to produce the same output here.
30703 define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
30704 ; GCN-LABEL: v_sitofp_i32_to_bf16:
30706 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30707 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30708 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30709 ; GCN-NEXT: s_setpc_b64 s[30:31]
30711 ; GFX7-LABEL: v_sitofp_i32_to_bf16:
30713 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30714 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30715 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30716 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30718 ; GFX8-LABEL: v_sitofp_i32_to_bf16:
30720 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30721 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30722 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
30723 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
30724 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
30725 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
30726 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30727 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
30728 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30729 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30731 ; GFX9-LABEL: v_sitofp_i32_to_bf16:
30733 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30734 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30735 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30736 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
30737 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
30738 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
30739 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30740 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
30741 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30742 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30744 ; GFX10-LABEL: v_sitofp_i32_to_bf16:
30746 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30747 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30748 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
30749 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
30750 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30751 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30752 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30753 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30754 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30756 ; GFX11-LABEL: v_sitofp_i32_to_bf16:
30758 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30759 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30760 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
30761 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
30762 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
30763 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30764 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
30765 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
30766 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
30767 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
30768 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30769 %op = sitofp i32 %x to bfloat
; Codegen test for sitofp from <2 x i32> to <2 x bfloat>.
; The check lines below are the expected llc output for each RUN line at the
; top of the file; they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them instead of editing by hand.
;  - The GCN and GFX7 expectations convert each lane with v_cvt_f32_i32 and mask
;    the result with 0xffff0000.
;  - The GFX8 and later expectations run a v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence per lane (presumably
;    round-to-nearest-even with NaN quieting - confirm against the bf16
;    lowering) and then combine both lanes into v0 with v_alignbit_b32 (GFX8)
;    or v_perm_b32 (GFX9 and later).
;  - The GFX11 expectations use the shared GFX11 prefix, so both true16 RUN
;    lines are expected to produce the same output here.
30773 define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
30774 ; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
30776 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30777 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30778 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30779 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30780 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30781 ; GCN-NEXT: s_setpc_b64 s[30:31]
30783 ; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
30785 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30786 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30787 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30788 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30789 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30790 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30792 ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
30794 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30795 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30796 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
30797 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
30798 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
30799 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30800 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
30801 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30802 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
30803 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
30804 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
30805 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
30806 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
30807 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30808 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
30809 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30810 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
30811 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30813 ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
30815 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30816 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30817 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
30818 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30819 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
30820 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
30821 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
30822 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30823 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30824 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
30825 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
30826 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
30827 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30828 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
30829 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30830 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
30831 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30833 ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
30835 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30836 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30837 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
30838 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
30839 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
30840 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
30841 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30842 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
30843 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
30844 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
30845 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30846 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30847 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30848 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30849 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30851 ; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
30853 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30854 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
30855 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
30856 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
30857 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
30858 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
30859 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
30860 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30861 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
30862 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
30863 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
30864 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
30865 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
30866 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30867 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
30868 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
30869 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30870 ; GFX11-NEXT: s_setpc_b64 s[30:31]
30871 %op = sitofp <2 x i32> %x to <2 x bfloat>
30872 ret <2 x bfloat> %op
; Codegen test for sitofp from <3 x i32> to <3 x bfloat>.
; The check lines below are the expected llc output for each RUN line at the
; top of the file; they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them instead of editing by hand.
;  - The GCN and GFX7 expectations convert each lane with v_cvt_f32_i32 and mask
;    the result with 0xffff0000.
;  - The GFX8 and later expectations run a v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u_f32 / v_cndmask sequence per lane (presumably
;    round-to-nearest-even with NaN quieting - confirm against the bf16
;    lowering) and then combine lanes with v_alignbit_b32 or v_perm_b32.
;  - The GFX11 expectations are split by true16 mode; the two variants differ
;    only in src0 of the final v_alignbit_b32 (v0 versus s0).
30875 define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
30876 ; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
30878 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30879 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
30880 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
30881 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
30882 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30883 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30884 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30885 ; GCN-NEXT: s_setpc_b64 s[30:31]
30887 ; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
30889 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30890 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
30891 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
30892 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
30893 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
30894 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
30895 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
30896 ; GFX7-NEXT: s_setpc_b64 s[30:31]
30898 ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
30900 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30901 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
30902 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
30903 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
30904 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
30905 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
30906 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30907 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
30908 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30909 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
30910 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
30911 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
30912 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30913 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
30914 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30915 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
30916 ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
30917 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
30918 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
30919 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
30920 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30921 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
30922 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
30923 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
30924 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
30925 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
30926 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30928 ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
30930 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30931 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
30932 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
30933 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
30934 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
30935 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
30936 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
30937 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
30938 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
30939 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
30940 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
30941 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
30942 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
30943 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
30944 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
30945 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
30946 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
30947 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
30948 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
30949 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
30950 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
30951 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
30952 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
30953 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30955 ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
30957 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30958 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
30959 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
30960 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
30961 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
30962 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
30963 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
30964 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30965 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
30966 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30967 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
30968 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
30969 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
30970 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30971 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
30972 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30973 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
30974 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
30975 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
30976 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
30977 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
30978 ; GFX10-NEXT: s_setpc_b64 s[30:31]
30980 ; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
30981 ; GFX11TRUE16: ; %bb.0:
30982 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30983 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
30984 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
30985 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
30986 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
30987 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
30988 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
30989 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
30990 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
30991 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
30992 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
30993 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
30994 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
30995 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
30996 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
30997 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
30998 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
30999 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31000 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31001 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31002 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31003 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31004 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
31005 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
31007 ; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
31008 ; GFX11FAKE16: ; %bb.0:
31009 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31010 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31011 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31012 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31013 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31014 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31015 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
31016 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31017 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31018 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
31019 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31020 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
31021 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
31022 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
31023 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31024 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31025 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31026 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
31027 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31028 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
31029 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31030 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
31031 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
31032 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
31033 %op = sitofp <3 x i32> %x to <3 x bfloat>
31034 ret <3 x bfloat> %op
; Test: sitofp of <4 x i32> to <4 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; What each check group below expects:
;  * GCN and GFX7 groups -- four v_cvt_f32_i32, then each of v0..v3 is masked
;    with 0xffff0000 (low 16 bits cleared); one element stays in each VGPR.
;  * GFX8 group -- per element: v_bfe_u32 of bit 16, two v_add_u32 (the value,
;    then 0x7fff either as a literal or via s4), v_or_b32 with 0x400000, and a
;    v_cmp_u_f32 / v_cndmask_b32 select; the four results are packed into v0
;    and v1 with v_lshrrev_b32 plus v_alignbit_b32.
;  * GFX9 group -- same per-element shape using v_add3_u32 with s4 = 0x7fff;
;    packing uses v_perm_b32 with selector 0x7060302 loaded into s4.
;  * GFX10 and GFX11 groups -- as GFX9 but with 0x7fff and 0x7060302 written
;    as literals; the GFX11 group additionally checks the s_delay_alu lines.
; The GFX11 prefix is shared by both gfx1100 RUN lines (+real-true16 and
; -real-true16), so the single GFX11 group applies to both.
; NOTE(review): the bfe/add/or/cmp_u/cndmask sequence looks like
; round-to-nearest-even with NaN quieting -- confirm against the bf16 lowering.
31037 define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
31038 ; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
31040 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31041 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
31042 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
31043 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
31044 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31045 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31046 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31047 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
31048 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
31049 ; GCN-NEXT: s_setpc_b64 s[30:31]
31051 ; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
31053 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31054 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31055 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
31056 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31057 ; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
31058 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31059 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31060 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
31061 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
31062 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31064 ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
31066 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31067 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
31068 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
31069 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31070 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31071 ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
31072 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
31073 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31074 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
31075 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31076 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
31077 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
31078 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
31079 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
31080 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
31081 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
31082 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
31083 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
31084 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
31085 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
31086 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31087 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
31088 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31089 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
31090 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
31091 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
31092 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
31093 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
31094 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
31095 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
31096 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
31097 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
31098 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
31099 ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
31100 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31102 ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
31104 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31105 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
31106 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
31107 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31108 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31109 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
31110 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
31111 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
31112 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31113 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
31114 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
31115 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
31116 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
31117 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
31118 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
31119 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
31120 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
31121 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
31122 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
31123 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31124 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
31125 ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
31126 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
31127 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
31128 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
31129 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
31130 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31131 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
31132 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
31133 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31135 ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
31137 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31138 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
31139 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31140 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31141 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
31142 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
31143 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
31144 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
31145 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31146 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
31147 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31148 ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
31149 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
31150 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
31151 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
31152 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31153 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31154 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
31155 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
31156 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
31157 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31158 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31159 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31160 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
31161 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31162 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31163 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
31164 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31166 ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
31168 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31169 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
31170 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31171 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
31172 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
31173 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
31174 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
31175 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
31176 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
31177 ; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
31178 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31179 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
31180 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
31181 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
31182 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
31183 ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
31184 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
31185 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31186 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
31187 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
31188 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
31189 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31190 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
31191 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
31192 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
31193 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
31194 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31195 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
31196 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
31197 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
31198 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31199 %op = sitofp <4 x i32> %x to <4 x bfloat>
31200 ret <4 x bfloat> %op
; Test: sitofp of a scalar i64 to bfloat.
; The check lines in this function are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Every check group below starts with the same i64 -> f32 shape on the v[0:1]
; pair: v_xor_b32 of v0 and v1, a leading-bit count of v1 (v_ffbh_i32, or
; v_cls_i32 in the GFX11 group), v_ashrrev_i32 by 31, adds of -1 and 32,
; v_min_u32 to pick the shift amount, a 64-bit left shift of v[0:1], v_min_u32
; of v0 with 1 OR-ed into v1, v_cvt_f32_i32, and v_ldexp_f32 by (32 - shift).
; The groups differ in how that f32 is narrowed afterwards:
;  * GCN and GFX7 groups -- a single v_and_b32 with 0xffff0000.
;  * GFX8 group -- v_bfe_u32 of bit 16, two v_add_u32 (the value, then 0x7fff),
;    v_or_b32 with 0x400000, a v_cmp_u_f32 / v_cndmask_b32 select, and a final
;    v_lshrrev_b32 by 16.
;  * GFX9, GFX10 and GFX11 groups -- as GFX8 but with one v_add3_u32 (0x7fff in
;    s4 for GFX9, a literal otherwise); GFX11 also checks the s_delay_alu lines.
; The GFX11 prefix is shared by both gfx1100 RUN lines (+real-true16 and
; -real-true16), so the single GFX11 group applies to both.
; NOTE(review): the bfe/add/or/cmp_u/cndmask sequence looks like
; round-to-nearest-even with NaN quieting -- confirm against the bf16 lowering.
31203 define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
31204 ; GCN-LABEL: v_sitofp_i64_to_bf16:
31206 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31207 ; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
31208 ; GCN-NEXT: v_ffbh_i32_e32 v3, v1
31209 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31210 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31211 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v2
31212 ; GCN-NEXT: v_min_u32_e32 v2, v3, v2
31213 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
31214 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31215 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31216 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31217 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
31218 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
31219 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31220 ; GCN-NEXT: s_setpc_b64 s[30:31]
31222 ; GFX7-LABEL: v_sitofp_i64_to_bf16:
31224 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31225 ; GFX7-NEXT: v_xor_b32_e32 v2, v0, v1
31226 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31227 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31228 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v2
31229 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31230 ; GFX7-NEXT: v_min_u32_e32 v2, v3, v2
31231 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
31232 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31233 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31234 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31235 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
31236 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
31237 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31238 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31240 ; GFX8-LABEL: v_sitofp_i64_to_bf16:
31242 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31243 ; GFX8-NEXT: v_xor_b32_e32 v2, v0, v1
31244 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31245 ; GFX8-NEXT: v_ffbh_i32_e32 v3, v1
31246 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2
31247 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3
31248 ; GFX8-NEXT: v_min_u32_e32 v2, v3, v2
31249 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31250 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31251 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31252 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31253 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
31254 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
31255 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
31256 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
31257 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
31258 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
31259 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31260 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
31261 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31262 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31264 ; GFX9-LABEL: v_sitofp_i64_to_bf16:
31266 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31267 ; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1
31268 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31269 ; GFX9-NEXT: v_ffbh_i32_e32 v3, v1
31270 ; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
31271 ; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
31272 ; GFX9-NEXT: v_min_u32_e32 v2, v3, v2
31273 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31274 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31275 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31276 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31277 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31278 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
31279 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
31280 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
31281 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
31282 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
31283 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31284 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
31285 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31286 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31288 ; GFX10-LABEL: v_sitofp_i64_to_bf16:
31290 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31291 ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
31292 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
31293 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31294 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
31295 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
31296 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
31297 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31298 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31299 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31300 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
31301 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31302 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
31303 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
31304 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
31305 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31306 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
31307 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31308 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31309 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31311 ; GFX11-LABEL: v_sitofp_i64_to_bf16:
31313 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31314 ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
31315 ; GFX11-NEXT: v_cls_i32_e32 v3, v1
31316 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31317 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
31318 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
31319 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
31320 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
31321 ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
31322 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31323 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
31324 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
31325 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31326 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
31327 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
31328 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31329 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
31330 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
31331 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
31332 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
31333 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31334 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
31335 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
31336 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
31337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31338 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31339 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31340 %op = sitofp i64 %x to bfloat
; Test: sitofp of <2 x i64> to <2 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Each check group below applies the same i64 -> f32 shape to both register
; pairs v[0:1] and v[2:3]: v_xor_b32 of the two halves, a leading-bit count of
; the odd register (v_ffbh_i32, or v_cls_i32 in the GFX11 group),
; v_ashrrev_i32 by 31, adds of -1 and 32, v_min_u32 to pick the shift amount,
; a 64-bit left shift, v_min_u32 with 1 OR-ed into the upper word,
; v_cvt_f32_i32, and v_ldexp_f32 by (32 - shift).
; The groups differ in how the two f32 values are narrowed and returned:
;  * GCN and GFX7 groups -- v0 and v1 are each masked with 0xffff0000; the two
;    results stay in separate VGPRs.
;  * GFX8 group -- per element: v_bfe_u32 of bit 16, two v_add_u32 (the value,
;    then 0x7fff via s4 or as a literal), v_or_b32 with 0x400000, and a
;    v_cmp_u_f32 / v_cndmask_b32 select; the pair is packed into v0 with
;    v_lshrrev_b32 plus v_alignbit_b32.
;  * GFX9 group -- v_add3_u32 with s4 = 0x7fff; packed into v0 by v_perm_b32
;    with selector 0x7060302 loaded into s4.
;  * GFX10 and GFX11 groups -- as GFX9 but with 0x7fff and 0x7060302 written
;    as literals; the GFX11 group additionally checks the s_delay_alu lines.
; The GFX11 prefix is shared by both gfx1100 RUN lines (+real-true16 and
; -real-true16), so the single GFX11 group applies to both.
; NOTE(review): the bfe/add/or/cmp_u/cndmask sequence looks like
; round-to-nearest-even with NaN quieting -- confirm against the bf16 lowering.
31344 define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
31345 ; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
31347 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31348 ; GCN-NEXT: v_ffbh_i32_e32 v4, v3
31349 ; GCN-NEXT: v_xor_b32_e32 v5, v2, v3
31350 ; GCN-NEXT: v_ffbh_i32_e32 v6, v1
31351 ; GCN-NEXT: v_xor_b32_e32 v7, v0, v1
31352 ; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v4
31353 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31354 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31355 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31356 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31357 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31358 ; GCN-NEXT: v_min_u32_e32 v4, v4, v5
31359 ; GCN-NEXT: v_min_u32_e32 v5, v6, v7
31360 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
31361 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
31362 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
31363 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
31364 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
31365 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31366 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
31367 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31368 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2
31369 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31370 ; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
31371 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
31372 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31373 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31374 ; GCN-NEXT: s_setpc_b64 s[30:31]
31376 ; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
31378 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31379 ; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3
31380 ; GFX7-NEXT: v_ffbh_i32_e32 v4, v3
31381 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31382 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4
31383 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31384 ; GFX7-NEXT: v_min_u32_e32 v4, v4, v5
31385 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
31386 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
31387 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
31388 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
31389 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31390 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31391 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31392 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31393 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
31394 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
31395 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31396 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31397 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31398 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31399 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
31400 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
31401 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
31402 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
31403 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31404 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31405 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31407 ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
31409 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31410 ; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1
31411 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v1
31412 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31413 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
31414 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
31415 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
31416 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31417 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31418 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31419 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31420 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31421 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
31422 ; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
31423 ; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
31424 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
31425 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
31426 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0
31427 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
31428 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31429 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
31430 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
31431 ; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
31432 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
31433 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
31434 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31435 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31436 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31437 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31438 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
31439 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
31440 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
31441 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
31442 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
31443 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
31444 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
31445 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31446 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31447 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
31448 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
31449 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31451 ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
31453 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31454 ; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1
31455 ; GFX9-NEXT: v_ffbh_i32_e32 v4, v1
31456 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31457 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
31458 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
31459 ; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
31460 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31461 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31462 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31463 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31464 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31465 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
31466 ; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
31467 ; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
31468 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
31469 ; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
31470 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
31471 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31472 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
31473 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
31474 ; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
31475 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
31476 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
31477 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31478 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31479 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31480 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31481 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
31482 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
31483 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
31484 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
31485 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
31486 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
31487 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31488 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31489 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31490 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
31491 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31493 ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
31495 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31496 ; GFX10-NEXT: v_xor_b32_e32 v4, v0, v1
31497 ; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3
31498 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v1
31499 ; GFX10-NEXT: v_ffbh_i32_e32 v7, v3
31500 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
31501 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31502 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
31503 ; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7
31504 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 32, v4
31505 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 32, v5
31506 ; GFX10-NEXT: v_min_u32_e32 v4, v6, v4
31507 ; GFX10-NEXT: v_min_u32_e32 v5, v7, v5
31508 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31509 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31510 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31511 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
31512 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31513 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
31514 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
31515 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
31516 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31517 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31518 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
31519 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
31520 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
31521 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
31522 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
31523 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31524 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
31525 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
31526 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
31527 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31528 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31529 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31530 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31531 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31533 ; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
31535 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31536 ; GFX11-NEXT: v_xor_b32_e32 v4, v0, v1
31537 ; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3
31538 ; GFX11-NEXT: v_cls_i32_e32 v6, v1
31539 ; GFX11-NEXT: v_cls_i32_e32 v7, v3
31540 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31541 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4
31542 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31543 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31544 ; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6
31545 ; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7
31546 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31547 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v4
31548 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 32, v5
31549 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31550 ; GFX11-NEXT: v_min_u32_e32 v4, v6, v4
31551 ; GFX11-NEXT: v_min_u32_e32 v5, v7, v5
31552 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31553 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
31554 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31555 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31556 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
31557 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
31558 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31559 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
31560 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
31561 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4
31562 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5
31563 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31564 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
31565 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
31566 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31567 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
31568 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
31569 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
31570 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
31571 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
31572 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
31573 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31574 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
31575 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
31576 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
31577 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
31578 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
31579 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31580 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
31581 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31582 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
31583 ; GFX11-NEXT: s_setpc_b64 s[30:31]
31584 %op = sitofp <2 x i64> %x to <2 x bfloat>
31585 ret <2 x bfloat> %op
31588 define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
31589 ; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
31591 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31592 ; GCN-NEXT: v_ffbh_i32_e32 v6, v5
31593 ; GCN-NEXT: v_xor_b32_e32 v7, v4, v5
31594 ; GCN-NEXT: v_ffbh_i32_e32 v8, v3
31595 ; GCN-NEXT: v_xor_b32_e32 v9, v2, v3
31596 ; GCN-NEXT: v_ffbh_i32_e32 v10, v1
31597 ; GCN-NEXT: v_xor_b32_e32 v11, v0, v1
31598 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31599 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31600 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
31601 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31602 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
31603 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
31604 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31605 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
31606 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
31607 ; GCN-NEXT: v_min_u32_e32 v6, v6, v7
31608 ; GCN-NEXT: v_min_u32_e32 v7, v8, v9
31609 ; GCN-NEXT: v_min_u32_e32 v8, v10, v11
31610 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
31611 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
31612 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
31613 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
31614 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
31615 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
31616 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
31617 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
31618 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
31619 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
31620 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
31621 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
31622 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4
31623 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
31624 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
31625 ; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
31626 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
31627 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
31628 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31629 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31630 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
31631 ; GCN-NEXT: s_setpc_b64 s[30:31]
31633 ; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
31635 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31636 ; GFX7-NEXT: v_xor_b32_e32 v7, v4, v5
31637 ; GFX7-NEXT: v_ffbh_i32_e32 v6, v5
31638 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31639 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31640 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31641 ; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
31642 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
31643 ; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3
31644 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
31645 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
31646 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
31647 ; GFX7-NEXT: v_ffbh_i32_e32 v6, v3
31648 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31649 ; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
31650 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
31651 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
31652 ; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
31653 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
31654 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
31655 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
31656 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
31657 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
31658 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
31659 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
31660 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
31661 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
31662 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
31663 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
31664 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
31665 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
31666 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
31667 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
31668 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
31669 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
31670 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
31671 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
31672 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
31673 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
31674 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
31675 ; GFX7-NEXT: s_setpc_b64 s[30:31]
31677 ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
31679 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31680 ; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5
31681 ; GFX8-NEXT: v_ffbh_i32_e32 v6, v5
31682 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31683 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
31684 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
31685 ; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
31686 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31687 ; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1
31688 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
31689 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
31690 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
31691 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
31692 ; GFX8-NEXT: v_ffbh_i32_e32 v7, v1
31693 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
31694 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31695 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
31696 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7
31697 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8
31698 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
31699 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
31700 ; GFX8-NEXT: v_min_u32_e32 v7, v7, v8
31701 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
31702 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
31703 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
31704 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31705 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
31706 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
31707 ; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3
31708 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
31709 ; GFX8-NEXT: v_ffbh_i32_e32 v5, v3
31710 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
31711 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
31712 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
31713 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
31714 ; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
31715 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
31716 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
31717 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
31718 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
31719 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
31720 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
31721 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
31722 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
31723 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
31724 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
31725 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
31726 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31727 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
31728 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
31729 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
31730 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
31731 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
31732 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
31733 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
31734 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
31735 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
31736 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
31737 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
31738 ; GFX8-NEXT: s_setpc_b64 s[30:31]
31740 ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
31742 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31743 ; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5
31744 ; GFX9-NEXT: v_ffbh_i32_e32 v6, v5
31745 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31746 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
31747 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
31748 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
31749 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31750 ; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
31751 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
31752 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
31753 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
31754 ; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
31755 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31756 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
31757 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
31758 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
31759 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
31760 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
31761 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31762 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31763 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31764 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
31765 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
31766 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
31767 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
31768 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
31769 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
31770 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
31771 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
31772 ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
31773 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
31774 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
31775 ; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
31776 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
31777 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
31778 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
31779 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
31780 ; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
31781 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
31782 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
31783 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
31784 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
31785 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
31786 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
31787 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
31788 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
31789 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
31790 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
31791 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
31792 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
31793 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
31794 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
31795 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
31796 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
31797 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
31798 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31800 ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
31802 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31803 ; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1
31804 ; GFX10-NEXT: v_xor_b32_e32 v7, v4, v5
31805 ; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3
31806 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v1
31807 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v5
31808 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31809 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31810 ; GFX10-NEXT: v_ffbh_i32_e32 v11, v3
31811 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31812 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
31813 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
31814 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
31815 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7
31816 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11
31817 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
31818 ; GFX10-NEXT: v_min_u32_e32 v8, v10, v8
31819 ; GFX10-NEXT: v_min_u32_e32 v6, v6, v7
31820 ; GFX10-NEXT: v_min_u32_e32 v7, v11, v9
31821 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31822 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31823 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31824 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31825 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
31826 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
31827 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
31828 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
31829 ; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
31830 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31831 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
31832 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31833 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
31834 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
31835 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
31836 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
31837 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
31838 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
31839 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
31840 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
31841 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31842 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
31843 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
31844 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31845 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
31846 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
31847 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31848 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31849 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31850 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31851 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31852 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31853 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31854 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31855 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
31856 ; GFX10-NEXT: s_setpc_b64 s[30:31]
31858 ; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31859 ; GFX11TRUE16: ; %bb.0:
31860 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31861 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
31862 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
31863 ; GFX11TRUE16-NEXT: v_xor_b32_e32 v9, v2, v3
31864 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v1
31865 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v6, v5
31866 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31867 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31868 ; GFX11TRUE16-NEXT: v_cls_i32_e32 v11, v3
31869 ; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31870 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
31871 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
31872 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
31873 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
31874 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
31875 ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
31876 ; GFX11TRUE16-NEXT: v_min_u32_e32 v8, v10, v8
31877 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31878 ; GFX11TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
31879 ; GFX11TRUE16-NEXT: v_min_u32_e32 v7, v11, v9
31880 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31881 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31882 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31883 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31884 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31885 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31886 ; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
31887 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31888 ; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
31889 ; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
31890 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31891 ; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
31892 ; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
31893 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31894 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31895 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
31896 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31897 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31898 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31899 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31900 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31901 ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
31902 ; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v6
31903 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31904 ; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
31905 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31906 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31907 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31908 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
31909 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
31910 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
31911 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31912 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
31913 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
31914 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31915 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31916 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31917 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31918 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31919 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31920 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31921 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31922 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31923 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
31924 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
31925 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
31927 ; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
31928 ; GFX11FAKE16: ; %bb.0:
31929 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31930 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v0, v1
31931 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v7, v4, v5
31932 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v9, v2, v3
31933 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v1
31934 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v5
31935 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
31936 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
31937 ; GFX11FAKE16-NEXT: v_cls_i32_e32 v11, v3
31938 ; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
31939 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
31940 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
31941 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
31942 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
31943 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
31944 ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
31945 ; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v10, v8
31946 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31947 ; GFX11FAKE16-NEXT: v_min_u32_e32 v6, v6, v7
31948 ; GFX11FAKE16-NEXT: v_min_u32_e32 v7, v11, v9
31949 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31950 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
31951 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
31952 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
31953 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
31954 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
31955 ; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
31956 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
31957 ; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
31958 ; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
31959 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31960 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
31961 ; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
31962 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
31963 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
31964 ; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
31965 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
31966 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
31967 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
31968 ; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
31969 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31970 ; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
31971 ; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
31972 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
31973 ; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
31974 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
31975 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
31976 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
31977 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
31978 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
31979 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
31980 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
31981 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
31982 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
31983 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
31984 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
31985 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
31986 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
31987 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
31988 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
31989 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
31990 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
31991 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
31992 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
31993 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
31994 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
31995 %op = sitofp <3 x i64> %x to <3 x bfloat>
31996 ret <3 x bfloat> %op
; Test: signed integer to floating point conversion (sitofp) of a <4 x i64>
; argument %x into a <4 x bfloat> return value.
;
; The check lines inside this function are autogenerated (see the note at the
; top of this file) and come in one group per check prefix used by the RUN
; lines. Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expected sequences show, per group:
;   - GCN and GFX7 groups return one value per register in v0..v3, each
;     masked with 0xffff0000 right before the return.
;   - GFX8, GFX9, GFX10 and GFX11 groups instead extract bit 16 (v_bfe_u32),
;     add it together with 0x7fff, and pick the input OR'd with 0x400000
;     whenever v_cmp_u_f32 of the value against itself sets the condition.
;     The four results are then packed pairwise into v0 and v1, using
;     v_alignbit_b32 on GFX8 and v_perm_b32 with selector 0x7060302 on the
;     later targets.
;   - The GFX11 group differs from the GFX10 group by using v_cls_i32 in
;     place of v_ffbh_i32 and by the added s_delay_alu lines.
31999 define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
32000 ; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
32002 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32003 ; GCN-NEXT: v_ffbh_i32_e32 v8, v7
32004 ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
32005 ; GCN-NEXT: v_ffbh_i32_e32 v10, v5
32006 ; GCN-NEXT: v_xor_b32_e32 v11, v4, v5
32007 ; GCN-NEXT: v_ffbh_i32_e32 v12, v3
32008 ; GCN-NEXT: v_xor_b32_e32 v13, v2, v3
32009 ; GCN-NEXT: v_ffbh_i32_e32 v14, v1
32010 ; GCN-NEXT: v_xor_b32_e32 v15, v0, v1
32011 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32012 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32013 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
32014 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32015 ; GCN-NEXT: v_add_i32_e32 v12, vcc, -1, v12
32016 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v13
32017 ; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v14
32018 ; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v15
32019 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32020 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
32021 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v13
32022 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 32, v15
32023 ; GCN-NEXT: v_min_u32_e32 v8, v8, v9
32024 ; GCN-NEXT: v_min_u32_e32 v9, v10, v11
32025 ; GCN-NEXT: v_min_u32_e32 v10, v12, v13
32026 ; GCN-NEXT: v_min_u32_e32 v11, v14, v15
32027 ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
32028 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
32029 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
32030 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
32031 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
32032 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
32033 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
32034 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
32035 ; GCN-NEXT: v_min_u32_e32 v6, 1, v6
32036 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
32037 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
32038 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
32039 ; GCN-NEXT: v_or_b32_e32 v6, v7, v6
32040 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
32041 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
32042 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
32043 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6
32044 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4
32045 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
32046 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
32047 ; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
32048 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
32049 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
32050 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
32051 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
32052 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
32053 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
32054 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
32055 ; GCN-NEXT: s_setpc_b64 s[30:31]
32057 ; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
32059 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32060 ; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7
32061 ; GFX7-NEXT: v_ffbh_i32_e32 v8, v7
32062 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32063 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32064 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32065 ; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
32066 ; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
32067 ; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5
32068 ; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
32069 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
32070 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
32071 ; GFX7-NEXT: v_ffbh_i32_e32 v8, v5
32072 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32073 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
32074 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
32075 ; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
32076 ; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6
32077 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
32078 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
32079 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
32080 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
32081 ; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3
32082 ; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
32083 ; GFX7-NEXT: v_ffbh_i32_e32 v7, v3
32084 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32085 ; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
32086 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7
32087 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8
32088 ; GFX7-NEXT: v_min_u32_e32 v7, v7, v8
32089 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
32090 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
32091 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
32092 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
32093 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
32094 ; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
32095 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32096 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
32097 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
32098 ; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
32099 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
32100 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
32101 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
32102 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
32103 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
32104 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
32105 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
32106 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
32107 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
32108 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
32109 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
32110 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
32111 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
32112 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32114 ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
32116 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32117 ; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5
32118 ; GFX8-NEXT: v_ffbh_i32_e32 v8, v5
32119 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32120 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
32121 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
32122 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
32123 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32124 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32125 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
32126 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
32127 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
32128 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
32129 ; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
32130 ; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
32131 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
32132 ; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7
32133 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
32134 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v7
32135 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32136 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
32137 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
32138 ; GFX8-NEXT: v_min_u32_e32 v10, v4, v5
32139 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
32140 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
32141 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
32142 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
32143 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
32144 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
32145 ; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1
32146 ; GFX8-NEXT: v_ffbh_i32_e32 v8, v1
32147 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32148 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
32149 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
32150 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
32151 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
32152 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
32153 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
32154 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
32155 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
32156 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
32157 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
32158 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
32159 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
32160 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
32161 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
32162 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32163 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
32164 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
32165 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
32166 ; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
32167 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
32168 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
32169 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
32170 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
32171 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
32172 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
32173 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
32174 ; GFX8-NEXT: v_min_u32_e32 v8, v0, v1
32175 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
32176 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
32177 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
32178 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
32179 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
32180 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
32181 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
32182 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
32183 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
32184 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
32185 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
32186 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32187 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
32188 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32189 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32190 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
32191 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32192 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
32193 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
32194 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32196 ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
32198 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32199 ; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5
32200 ; GFX9-NEXT: v_ffbh_i32_e32 v8, v5
32201 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
32202 ; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
32203 ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
32204 ; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
32205 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32206 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32207 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
32208 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
32209 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
32210 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
32211 ; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
32212 ; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
32213 ; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7
32214 ; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
32215 ; GFX9-NEXT: v_ffbh_i32_e32 v4, v7
32216 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
32217 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
32218 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
32219 ; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
32220 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
32221 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
32222 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
32223 ; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
32224 ; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
32225 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32226 ; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
32227 ; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
32228 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
32229 ; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
32230 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
32231 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
32232 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
32233 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
32234 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
32235 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
32236 ; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
32237 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
32238 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
32239 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
32240 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
32241 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
32242 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32243 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
32244 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
32245 ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
32246 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
32247 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
32248 ; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
32249 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
32250 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
32251 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
32252 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
32253 ; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
32254 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
32255 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
32256 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
32257 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
32258 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
32259 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
32260 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
32261 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
32262 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
32263 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
32264 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
32265 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
32266 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32267 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32268 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32269 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
32270 ; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
32271 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32273 ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
32275 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32276 ; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5
32277 ; GFX10-NEXT: v_ffbh_i32_e32 v9, v5
32278 ; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7
32279 ; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1
32280 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v7
32281 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32282 ; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9
32283 ; GFX10-NEXT: v_ffbh_i32_e32 v12, v1
32284 ; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3
32285 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32286 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
32287 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
32288 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12
32289 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14
32290 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11
32291 ; GFX10-NEXT: v_min_u32_e32 v8, v9, v8
32292 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13
32293 ; GFX10-NEXT: v_ffbh_i32_e32 v13, v3
32294 ; GFX10-NEXT: v_add_nc_u32_e32 v14, 32, v14
32295 ; GFX10-NEXT: v_min_u32_e32 v10, v10, v11
32296 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32297 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
32298 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13
32299 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
32300 ; GFX10-NEXT: v_min_u32_e32 v9, v12, v9
32301 ; GFX10-NEXT: v_min_u32_e32 v11, v13, v14
32302 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
32303 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
32304 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
32305 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
32306 ; GFX10-NEXT: v_min_u32_e32 v5, 1, v6
32307 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8
32308 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
32309 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
32310 ; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4
32311 ; GFX10-NEXT: v_or_b32_e32 v5, v7, v5
32312 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
32313 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
32314 ; GFX10-NEXT: v_ldexp_f32 v2, v4, v6
32315 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v5
32316 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v10
32317 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
32318 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9
32319 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
32320 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11
32321 ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
32322 ; GFX10-NEXT: v_ldexp_f32 v3, v3, v4
32323 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
32324 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
32325 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
32326 ; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
32327 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
32328 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
32329 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32330 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
32331 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
32332 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32333 ; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
32334 ; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
32335 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
32336 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32337 ; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
32338 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
32339 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32340 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32341 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32342 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32343 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
32344 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32345 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
32346 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32348 ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
32350 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32351 ; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5
32352 ; GFX11-NEXT: v_cls_i32_e32 v9, v5
32353 ; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7
32354 ; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1
32355 ; GFX11-NEXT: v_cls_i32_e32 v10, v7
32356 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8
32357 ; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9
32358 ; GFX11-NEXT: v_cls_i32_e32 v12, v1
32359 ; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3
32360 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
32361 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8
32362 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
32363 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
32364 ; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14
32365 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
32366 ; GFX11-NEXT: v_min_u32_e32 v8, v9, v8
32367 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13
32368 ; GFX11-NEXT: v_cls_i32_e32 v13, v3
32369 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 32, v14
32370 ; GFX11-NEXT: v_min_u32_e32 v10, v10, v11
32371 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
32372 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
32373 ; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13
32374 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
32375 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
32376 ; GFX11-NEXT: v_min_u32_e32 v9, v12, v9
32377 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32378 ; GFX11-NEXT: v_min_u32_e32 v11, v13, v14
32379 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
32380 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
32381 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
32382 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
32383 ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
32384 ; GFX11-NEXT: v_min_u32_e32 v5, 1, v6
32385 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8
32386 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
32387 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
32388 ; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4
32389 ; GFX11-NEXT: v_or_b32_e32 v5, v7, v5
32390 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32391 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
32392 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
32393 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32394 ; GFX11-NEXT: v_ldexp_f32 v2, v4, v6
32395 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v5
32396 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v10
32397 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
32398 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9
32399 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
32400 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11
32401 ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
32402 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v4
32403 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
32404 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
32405 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v6
32406 ; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
32407 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
32408 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
32409 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32410 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
32411 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
32412 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32413 ; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
32414 ; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
32415 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
32416 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32417 ; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
32418 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
32419 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32420 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
32421 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32422 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
32423 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32425 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
32426 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
32427 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
32428 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32429 %op = sitofp <4 x i64> %x to <4 x bfloat>
32430 ret <4 x bfloat> %op
; Scalar test for `uitofp i16 -> bfloat`.
; What the checks below expect: the GCN and GFX7 prefixes mask the input to
; 16 bits, convert with v_cvt_f32_u32 and AND the result with 0x7fff0000.
; The GFX8 and newer prefixes convert to f32 and then run a
; v_bfe_u32 (bit 16) / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence before
; shifting right by 16 (presumably the f32 -> bf16 rounding with a NaN
; special case; TODO confirm against the lowering code).
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
32433 define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
32434 ; GCN-LABEL: v_uitofp_i16_to_bf16:
32436 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32437 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32438 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32439 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32440 ; GCN-NEXT: s_setpc_b64 s[30:31]
32442 ; GFX7-LABEL: v_uitofp_i16_to_bf16:
32444 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32445 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32446 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32447 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32448 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32450 ; GFX8-LABEL: v_uitofp_i16_to_bf16:
32452 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32453 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32454 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
32455 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
32456 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
32457 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
32458 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32459 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
32460 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32461 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32463 ; GFX9-LABEL: v_uitofp_i16_to_bf16:
32465 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32466 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32467 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32468 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
32469 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
32470 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
32471 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32472 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
32473 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32474 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32476 ; GFX10-LABEL: v_uitofp_i16_to_bf16:
32478 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32479 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32480 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
32481 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
32482 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32483 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
32484 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32485 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32486 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32488 ; GFX11-LABEL: v_uitofp_i16_to_bf16:
32490 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32491 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
32492 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32493 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32494 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
32495 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
32496 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32497 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
32498 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
32499 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
32500 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
32501 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32502 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32503 %op = uitofp i16 %x to bfloat
; Two-element vector test for `uitofp <2 x i16> -> <2 x bfloat>`.
; In the checks below the GCN and GFX7 prefixes handle the elements in
; separate registers (v0 and v1 are each masked, converted and ANDed with
; 0x7fff0000). The GFX8 and newer prefixes read both elements out of v0
; (WORD_0 / WORD_1 selects, or an AND plus a 16-bit right shift on GFX11) and
; pack the two results back into v0 with v_alignbit_b32 (GFX8) or
; v_perm_b32 with selector 0x7060302 (GFX9 and newer).
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
32507 define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
32508 ; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
32510 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32511 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32512 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32513 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32514 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32515 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32516 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32517 ; GCN-NEXT: s_setpc_b64 s[30:31]
32519 ; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
32521 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32522 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32523 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32524 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32525 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32526 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32527 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32528 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32530 ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
32532 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32533 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32534 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32535 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
32536 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
32537 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32538 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
32539 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32540 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32541 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
32542 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
32543 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32544 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
32545 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32546 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32547 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32548 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
32549 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32551 ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
32553 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32554 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32555 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32556 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32557 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
32558 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
32559 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
32560 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32561 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32562 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
32563 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
32564 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
32565 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32566 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
32567 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32568 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
32569 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32571 ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
32573 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32574 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32575 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32576 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
32577 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
32578 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
32579 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32580 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
32581 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
32582 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
32583 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32584 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32585 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32586 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
32587 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32589 ; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
32591 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32592 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
32593 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32595 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
32596 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32597 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32598 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
32599 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
32600 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
32601 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32602 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
32603 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
32604 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
32605 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32606 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
32607 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32608 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
32609 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
32610 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
32611 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32612 %op = uitofp <2 x i16> %x to <2 x bfloat>
32613 ret <2 x bfloat> %op
; Odd-width vector test for `uitofp <3 x i16> -> <3 x bfloat>`.
; In the checks below the GCN and GFX7 prefixes treat v0, v1 and v2 as three
; separate elements. The GFX8 and newer prefixes read elements 0 and 1 from
; the two halves of v0 and element 2 from the low half of v1, then pack the
; first two results into v0 and place the third in v1.
; This is the only function in this group that needs the separate
; GFX11TRUE16 and GFX11FAKE16 prefixes; their expected sequences differ only
; in the first source operand of the final v_alignbit_b32 (v0 versus s0).
; NOTE(review): that operand, like the s4 used by GFX9/GFX10, looks like a
; don't-care value for the unused fourth lane; confirm before relying on it.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
32616 define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
32617 ; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
32619 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32620 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32621 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32622 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
32623 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
32624 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32625 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32626 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32627 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32628 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32629 ; GCN-NEXT: s_setpc_b64 s[30:31]
32631 ; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
32633 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32634 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32635 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32636 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
32637 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32638 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32639 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
32640 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32641 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32642 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32643 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32645 ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
32647 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32648 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32649 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32650 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32651 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32652 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
32653 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
32654 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
32655 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
32656 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32657 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
32658 ; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1
32659 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
32660 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
32661 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4
32662 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32663 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
32664 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
32665 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
32666 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
32667 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
32668 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32669 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
32670 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32671 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32672 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
32673 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32675 ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
32677 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32678 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32679 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32680 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32681 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32682 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
32683 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
32684 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
32685 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32686 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
32687 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
32688 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
32689 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
32690 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32691 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32692 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
32693 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
32694 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
32695 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32696 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
32697 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32698 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
32699 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
32700 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32702 ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
32704 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32705 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32706 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32707 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32708 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
32709 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
32710 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
32711 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32712 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
32713 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32714 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
32715 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32716 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
32717 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32718 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32719 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32720 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32721 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32722 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32723 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32724 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
32725 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32727 ; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32728 ; GFX11TRUE16: ; %bb.0:
32729 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32730 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
32731 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32732 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
32733 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
32734 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
32735 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32736 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32737 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
32738 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32739 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
32740 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32741 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
32742 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
32743 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
32744 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
32745 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
32746 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32747 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
32748 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32749 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32750 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32751 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32752 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32753 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32754 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32755 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32756 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32757 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32758 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
32759 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
32761 ; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
32762 ; GFX11FAKE16: ; %bb.0:
32763 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32764 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
32765 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32766 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
32767 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
32768 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
32769 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
32770 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
32771 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
32772 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32773 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
32774 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32775 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
32776 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
32777 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
32778 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
32779 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
32780 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32781 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
32782 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
32783 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
32784 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32785 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
32786 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32787 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
32788 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32789 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32790 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
32791 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
32792 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
32793 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
32794 %op = uitofp <3 x i16> %x to <3 x bfloat>
32795 ret <3 x bfloat> %op
; Four-element vector test for `uitofp <4 x i16> -> <4 x bfloat>`.
; In the checks below the GCN and GFX7 prefixes treat v0..v3 as four separate
; elements (mask, v_cvt_f32_u32, AND with 0x7fff0000). The GFX8 and newer
; prefixes read two elements from each of v0 and v1 (low and high 16-bit
; halves), convert each to f32, and pack the four results back into v0 and v1
; with v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9 and
; newer).
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
32798 define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
32799 ; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
32801 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32802 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
32803 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
32804 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
32805 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
32806 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
32807 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
32808 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
32809 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32810 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32811 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32812 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32813 ; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
32814 ; GCN-NEXT: s_setpc_b64 s[30:31]
32816 ; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
32818 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32819 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32820 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32821 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
32822 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
32823 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32824 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
32825 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
32826 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
32827 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32828 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
32829 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
32830 ; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
32831 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32833 ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
32835 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32836 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32837 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32838 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32839 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
32840 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
32841 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
32842 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
32843 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
32844 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32845 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32846 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
32847 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
32848 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
32849 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
32850 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32851 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
32852 ; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
32853 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32854 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
32855 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
32856 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5
32857 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
32858 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
32859 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
32860 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
32861 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
32862 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
32863 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32864 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
32865 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32866 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32867 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
32868 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
32869 ; GFX8-NEXT: s_setpc_b64 s[30:31]
32871 ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
32873 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32874 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32875 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
32876 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32877 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
32878 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
32879 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
32880 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
32881 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
32882 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32883 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32884 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
32885 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
32886 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
32887 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
32888 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
32889 ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
32890 ; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
32891 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
32892 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
32893 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
32894 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
32895 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
32896 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
32897 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
32898 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
32899 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
32900 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
32901 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
32902 ; GFX9-NEXT: s_setpc_b64 s[30:31]
32904 ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
32906 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32907 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32908 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
32909 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32910 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
32911 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
32912 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
32913 ; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
32914 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32915 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
32916 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
32917 ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
32918 ; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
32919 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
32920 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
32921 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32922 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32923 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
32924 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
32925 ; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
32926 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32927 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32928 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32929 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32930 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
32931 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32932 ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
32933 ; GFX10-NEXT: s_setpc_b64 s[30:31]
32935 ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
32937 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32938 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1
32939 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
32940 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
32941 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
32942 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
32943 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
32944 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
32945 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
32946 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
32947 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
32948 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
32949 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
32950 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0
32951 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
32952 ; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
32953 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
32954 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
32955 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
32956 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
32957 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
32958 ; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
32959 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
32960 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
32961 ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
32962 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
32963 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
32964 ; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
32965 ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
32966 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
32967 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
32968 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
32969 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
32970 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
32971 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
32972 ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
32973 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
32974 ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
32975 ; GFX11-NEXT: s_setpc_b64 s[30:31]
32976 %op = uitofp <4 x i16> %x to <4 x bfloat>
32977 ret <4 x bfloat> %op
; Scalar test for `uitofp i32 -> bfloat`.
; Unlike the i16 variants, no prefix masks or sub-dword-selects the input in
; the checks below; every target starts with a plain v_cvt_f32_u32 on v0.
; The GCN and GFX7 prefixes then AND the f32 bits with 0x7fff0000, while the
; GFX8 and newer prefixes run the v_bfe_u32 (bit 16) / add 0x7fff /
; v_cmp_u_f32 / v_cndmask sequence and shift right by 16 (presumably the
; f32 -> bf16 rounding with a NaN special case; TODO confirm).
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
32980 define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
32981 ; GCN-LABEL: v_uitofp_i32_to_bf16:
32983 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32984 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
32985 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32986 ; GCN-NEXT: s_setpc_b64 s[30:31]
32988 ; GFX7-LABEL: v_uitofp_i32_to_bf16:
32990 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32991 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
32992 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
32993 ; GFX7-NEXT: s_setpc_b64 s[30:31]
32995 ; GFX8-LABEL: v_uitofp_i32_to_bf16:
32997 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32998 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
32999 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
33000 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
33001 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
33002 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
33003 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33004 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
33005 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33006 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33008 ; GFX9-LABEL: v_uitofp_i32_to_bf16:
33010 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33011 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33012 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33013 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
33014 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
33015 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
33016 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33017 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33018 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33019 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33021 ; GFX10-LABEL: v_uitofp_i32_to_bf16:
33023 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33024 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33025 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
33026 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
33027 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33028 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33029 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33030 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33031 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33033 ; GFX11-LABEL: v_uitofp_i32_to_bf16:
33035 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33036 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33037 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33038 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
33039 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
33040 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33041 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33043 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33044 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33045 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33046 %op = uitofp i32 %x to bfloat
; Codegen test for uitofp <2 x i32> -> <2 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; What the expectations below show:
;   - GCN and GFX7 prefixes - one v_cvt_f32_u32 per lane, then each lane is
;     masked with 0x7fff0000 and left in its own VGPR (v0, v1).
;   - GFX8 and newer prefixes - each converted lane goes through a
;     v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence (presumably
;     round-to-nearest-even with NaN quieting; confirm against the
;     lowering), and the two results are packed into v0 with
;     v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9+).
33050 define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
33051 ; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
33053 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33054 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33055 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33056 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33057 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33058 ; GCN-NEXT: s_setpc_b64 s[30:31]
33060 ; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
33062 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33063 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33064 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33065 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33066 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33067 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33069 ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
33071 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33072 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33073 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33074 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
33075 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
33076 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33077 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
33078 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33079 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
33080 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
33081 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
33082 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33083 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
33084 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33085 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
33086 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33087 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33088 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33090 ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
33092 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33093 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33094 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33095 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33096 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33097 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33098 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33099 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33100 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33101 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
33102 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
33103 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
33104 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33105 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
33106 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33107 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33110 ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
33112 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33113 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33114 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33115 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
33116 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
33117 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
33118 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33119 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
33120 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33121 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33122 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33123 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33124 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33125 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33126 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33128 ; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
33130 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33131 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33132 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33133 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33134 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
33135 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
33136 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
33137 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33138 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
33139 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33140 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33141 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33142 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33143 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33144 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
33146 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33147 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33148 %op = uitofp <2 x i32> %x to <2 x bfloat>
33149 ret <2 x bfloat> %op
; Codegen test for uitofp <3 x i32> -> <3 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; What the expectations below show:
;   - GCN and GFX7 prefixes - one v_cvt_f32_u32 per lane, then each lane is
;     masked with 0x7fff0000 and left in its own VGPR (v0, v1, v2).
;   - GFX8 and newer prefixes - lanes 0 and 1 are packed into v0 as in the
;     two-lane test; the odd third lane ends up in v1, via
;     v_lshrrev_b32 + v_mov_b32 on GFX8 and via v_alignbit_b32 on GFX9+.
;   - The high source operand of that final v_alignbit_b32 differs per
;     prefix (s4, s4, v0, s0). It is the only line that differs between the
;     GFX11TRUE16 and GFX11FAKE16 expectations, which is why this function
;     has two separate GFX11 bodies. Presumably that operand is an
;     undefined upper half - confirm before relying on it.
33152 define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
33153 ; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
33155 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33156 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33157 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33158 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33159 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33160 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33161 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33162 ; GCN-NEXT: s_setpc_b64 s[30:31]
33164 ; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
33166 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33167 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33168 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33169 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33170 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33171 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33172 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33173 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33175 ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
33177 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33178 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33179 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33180 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33181 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
33182 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
33183 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33184 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
33185 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33186 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
33187 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
33188 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
33189 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33190 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
33191 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33192 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
33193 ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
33194 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
33195 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
33196 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
33197 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33198 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
33199 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
33200 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33201 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33202 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
33203 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33205 ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
33207 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33208 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
33209 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33210 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33211 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33212 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
33213 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
33214 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
33215 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33216 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
33217 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
33218 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
33219 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
33220 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33221 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
33222 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
33223 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
33224 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
33225 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33226 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
33227 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33228 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33229 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
33230 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33232 ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
33234 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33235 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33236 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33237 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33238 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
33239 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
33240 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
33241 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33242 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
33243 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33244 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
33245 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33246 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
33247 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33248 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33249 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33250 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33251 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33252 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33253 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33254 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
33255 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33257 ; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33258 ; GFX11TRUE16: ; %bb.0:
33259 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33260 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
33261 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
33262 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
33263 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33264 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
33265 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
33266 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
33267 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33268 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
33269 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33270 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
33271 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33272 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
33273 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33274 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33275 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33276 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33277 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33278 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33279 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33280 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33281 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
33282 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
33284 ; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
33285 ; GFX11FAKE16: ; %bb.0:
33286 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33287 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
33288 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
33289 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
33290 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
33291 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
33292 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
33293 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
33294 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33295 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
33296 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33297 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
33298 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
33299 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
33300 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33301 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33302 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33303 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
33304 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33305 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
33306 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33307 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
33308 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
33309 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
33310 %op = uitofp <3 x i32> %x to <3 x bfloat>
33311 ret <3 x bfloat> %op
; Codegen test for uitofp <4 x i32> -> <4 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; What the expectations below show:
;   - GCN and GFX7 prefixes - one v_cvt_f32_u32 per lane, then each lane is
;     masked with 0x7fff0000 and left in its own VGPR (v0..v3).
;   - GFX8 and newer prefixes - every lane gets the
;     v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence, then lanes
;     0+1 are packed into v0 and lanes 2+3 into v1 (v_alignbit_b32 on GFX8,
;     v_perm_b32 with selector 0x7060302 on GFX9+).
;   - In the GFX8 body the 0x7fff addend appears both as an inline literal
;     and once via s4 (loaded by s_movk_i32); this is the generated output,
;     not a typo in the checks.
33314 define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
33315 ; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
33317 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33318 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
33319 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33320 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
33321 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33322 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33323 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33324 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33325 ; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
33326 ; GCN-NEXT: s_setpc_b64 s[30:31]
33328 ; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
33330 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33331 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33332 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
33333 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33334 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
33335 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
33336 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
33337 ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
33338 ; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
33339 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33341 ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
33343 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33344 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33345 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
33346 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33347 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
33348 ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
33349 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
33350 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33351 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
33352 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33353 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
33354 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
33355 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
33356 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
33357 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
33358 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
33359 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
33360 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
33361 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
33362 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
33363 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33364 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
33365 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33366 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
33367 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
33368 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
33369 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
33370 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
33371 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33372 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
33373 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
33374 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
33375 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
33376 ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
33377 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33379 ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
33381 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33382 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
33383 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
33384 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33385 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33386 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
33387 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
33388 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
33389 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33390 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
33391 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
33392 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
33393 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
33394 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
33395 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
33396 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
33397 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
33398 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
33399 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
33400 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33401 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
33402 ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
33403 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
33404 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
33405 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
33406 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
33407 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33408 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
33409 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
33410 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33412 ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
33414 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33415 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33416 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33417 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33418 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
33419 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
33420 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
33421 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
33422 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33423 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
33424 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33425 ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
33426 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
33427 ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
33428 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
33429 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33430 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33431 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
33432 ; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
33433 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
33434 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33435 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33436 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33437 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
33438 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33439 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33440 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
33441 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33443 ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
33445 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33446 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
33447 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33448 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33449 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
33450 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
33451 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
33452 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
33453 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
33454 ; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
33455 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33456 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
33457 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
33458 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
33459 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
33460 ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
33461 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
33462 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33463 ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
33464 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
33465 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
33466 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33467 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
33468 ; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
33469 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
33470 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
33471 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33472 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
33473 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
33474 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
33475 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33476 %op = uitofp <4 x i32> %x to <4 x bfloat>
33477 ret <4 x bfloat> %op
; Codegen test for scalar uitofp i64 -> bfloat.
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; What the expectations below show (same shape for every prefix):
;   1. Count leading zeros of the high dword v1 (v_ffbh_u32, spelled
;      v_clz_i32_u32 on GFX11) and clamp the count to 32.
;   2. Shift the 64-bit input v[0:1] left by that amount, reduce the low
;      dword to 0/1 with v_min_u32 and OR it into the high dword.
;   3. v_cvt_f32_u32 on the combined dword, then v_ldexp_f32 by
;      (32 - shift) to restore the magnitude.
;   4. Narrow to the upper 16 bits - a plain 0xffff0000 mask for the GCN
;      and GFX7 prefixes; the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 /
;      v_cndmask / v_lshrrev_b32 16 sequence for GFX8 and newer
;      (presumably round-to-nearest-even with NaN quieting; confirm
;      against the lowering).
33480 define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
33481 ; GCN-LABEL: v_uitofp_i64_to_bf16:
33483 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33484 ; GCN-NEXT: v_ffbh_u32_e32 v2, v1
33485 ; GCN-NEXT: v_min_u32_e32 v2, 32, v2
33486 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
33487 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33488 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33489 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33490 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
33491 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
33492 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33493 ; GCN-NEXT: s_setpc_b64 s[30:31]
33495 ; GFX7-LABEL: v_uitofp_i64_to_bf16:
33497 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33498 ; GFX7-NEXT: v_ffbh_u32_e32 v2, v1
33499 ; GFX7-NEXT: v_min_u32_e32 v2, 32, v2
33500 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
33501 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33502 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33503 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33504 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
33505 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
33506 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33507 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33509 ; GFX8-LABEL: v_uitofp_i64_to_bf16:
33511 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33512 ; GFX8-NEXT: v_ffbh_u32_e32 v2, v1
33513 ; GFX8-NEXT: v_min_u32_e32 v2, 32, v2
33514 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33515 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33516 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33517 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33518 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
33519 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
33520 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
33521 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
33522 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
33523 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
33524 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33525 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33526 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33527 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33529 ; GFX9-LABEL: v_uitofp_i64_to_bf16:
33531 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33532 ; GFX9-NEXT: v_ffbh_u32_e32 v2, v1
33533 ; GFX9-NEXT: v_min_u32_e32 v2, 32, v2
33534 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33535 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33536 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33537 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33538 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33539 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
33540 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
33541 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
33542 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
33543 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
33544 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33545 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
33546 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33547 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33549 ; GFX10-LABEL: v_uitofp_i64_to_bf16:
33551 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33552 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
33553 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
33554 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33555 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33556 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33557 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
33558 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33559 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
33560 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
33561 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
33562 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33563 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33564 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33565 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33566 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33568 ; GFX11-LABEL: v_uitofp_i64_to_bf16:
33570 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33571 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
33572 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33573 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
33574 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
33575 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33576 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
33577 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
33578 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
33579 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
33580 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33581 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
33582 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
33583 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
33584 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
33585 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33586 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
33587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33588 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
33589 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33590 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33591 %op = uitofp i64 %x to bfloat
; Codegen test for uitofp <2 x i64> -> <2 x bfloat>.
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; What the expectations below show:
;   - Each 64-bit lane (v[0:1] and v[2:3]) is converted with the same steps
;     as the scalar i64 test - leading-zero count of the high dword clamped
;     to 32, 64-bit left shift, v_min_u32 1 + v_or_b32 to fold the low
;     dword, v_cvt_f32_u32, then v_ldexp_f32 by (32 - shift).
;   - GCN and GFX7 prefixes - each lane is masked with 0xffff0000 and left
;     in its own VGPR (v0, v1).
;   - GFX8 and newer prefixes - each lane gets the
;     v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence and both
;     results are packed into v0 (v_alignbit_b32 on GFX8, v_perm_b32 with
;     selector 0x7060302 on GFX9+).
;   - The per-prefix bodies differ in how the two lanes are interleaved
;     (GFX7/8/9 run them mostly back to back, GCN/GFX10/GFX11 interleave).
33595 define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
33596 ; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
33598 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33599 ; GCN-NEXT: v_ffbh_u32_e32 v4, v3
33600 ; GCN-NEXT: v_ffbh_u32_e32 v5, v1
33601 ; GCN-NEXT: v_min_u32_e32 v4, 32, v4
33602 ; GCN-NEXT: v_min_u32_e32 v5, 32, v5
33603 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
33604 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
33605 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
33606 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
33607 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
33608 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33609 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
33610 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33611 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2
33612 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33613 ; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
33614 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
33615 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33616 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33617 ; GCN-NEXT: s_setpc_b64 s[30:31]
33619 ; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
33621 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33622 ; GFX7-NEXT: v_ffbh_u32_e32 v4, v3
33623 ; GFX7-NEXT: v_min_u32_e32 v4, 32, v4
33624 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
33625 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
33626 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
33627 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
33628 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
33629 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
33630 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
33631 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33632 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33633 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33634 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33635 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
33636 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
33637 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
33638 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33639 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33640 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33642 ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
33644 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33645 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v1
33646 ; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
33647 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33648 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33649 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33650 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33651 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
33652 ; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
33653 ; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
33654 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
33655 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0
33656 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
33657 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
33658 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
33659 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
33660 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33661 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33662 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33663 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33664 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
33665 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
33666 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
33667 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
33668 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
33669 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
33670 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
33671 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33672 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33673 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
33674 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
33675 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33677 ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
33679 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33680 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v1
33681 ; GFX9-NEXT: v_min_u32_e32 v4, 32, v4
33682 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33683 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33684 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33685 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33686 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33687 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
33688 ; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
33689 ; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
33690 ; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
33691 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
33692 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
33693 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
33694 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
33695 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33696 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33697 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33698 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33699 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
33700 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
33701 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
33702 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33703 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33704 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33705 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33706 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33707 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33708 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
33709 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33711 ; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
33713 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33714 ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
33715 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
33716 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
33717 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
33718 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33719 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33720 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33721 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
33722 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33723 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
33724 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
33725 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
33726 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33727 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33728 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
33729 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
33730 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
33731 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
33732 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
33733 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33734 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
33735 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33736 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33737 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33738 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33739 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33740 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33741 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33743 ; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
33745 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33746 ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1
33747 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3
33748 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33749 ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4
33750 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5
33751 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33752 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
33753 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33754 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33755 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
33756 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
33757 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33758 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
33759 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
33760 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4
33761 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5
33762 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
33763 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
33764 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
33765 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33766 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
33767 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
33768 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
33769 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
33770 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
33771 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
33772 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33773 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
33774 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
33775 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
33776 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
33777 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
33778 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33779 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
33780 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
33781 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
33782 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33783 %op = uitofp <2 x i64> %x to <2 x bfloat>
33784 ret <2 x bfloat> %op
; v_uitofp_v3i64_to_v3bf16 - codegen test for one `uitofp <3 x i64>` to
; `<3 x bfloat>` conversion.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Check blocks present for this function, in order: GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11TRUE16 and GFX11FAKE16 (the two gfx1100 configurations get
; separate blocks here instead of sharing one).
;
; What the expected code shows for each 64-bit lane, in every block:
;   ffbh/clz of the high dword, clamped to 32
;   -> 64-bit left shift by that count
;   -> low dword reduced with v_min_u32 against 1 and OR-ed with the high dword
;   -> v_cvt_f32_u32 -> v_ldexp_f32 by (32 - shift count).
; GCN and GFX7 then mask each f32 with 0xffff0000 and return one lane per
; VGPR (v0, v1, v2).
; GFX8 and later run a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
; v_cndmask_b32 sequence per lane (presumably round-to-nearest-even with NaN
; quieting - TODO confirm against the bf16 lowering) and pack the three results
; into v0 and v1 using v_lshrrev_b32, v_alignbit_b32 or v_perm_b32 depending on
; the target.
; NOTE(review): the GFX11TRUE16 and GFX11FAKE16 blocks differ only in their
; last s_delay_alu (VALU_DEP_1 vs VALU_DEP_2) and in the first source operand
; of the final v_alignbit_b32 (v0 vs s0).
33787 define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
33788 ; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
33790 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33791 ; GCN-NEXT: v_ffbh_u32_e32 v6, v5
33792 ; GCN-NEXT: v_ffbh_u32_e32 v7, v3
33793 ; GCN-NEXT: v_ffbh_u32_e32 v8, v1
33794 ; GCN-NEXT: v_min_u32_e32 v6, 32, v6
33795 ; GCN-NEXT: v_min_u32_e32 v7, 32, v7
33796 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8
33797 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
33798 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
33799 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
33800 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
33801 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
33802 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
33803 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
33804 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
33805 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
33806 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
33807 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
33808 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
33809 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
33810 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
33811 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
33812 ; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
33813 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
33814 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
33815 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33816 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33817 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
33818 ; GCN-NEXT: s_setpc_b64 s[30:31]
33820 ; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
33822 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33823 ; GFX7-NEXT: v_ffbh_u32_e32 v6, v5
33824 ; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
33825 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
33826 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
33827 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
33828 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
33829 ; GFX7-NEXT: v_ffbh_u32_e32 v6, v3
33830 ; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
33831 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
33832 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
33833 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
33834 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
33835 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
33836 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
33837 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
33838 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
33839 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
33840 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
33841 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
33842 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
33843 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
33844 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
33845 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
33846 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
33847 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
33848 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
33849 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
33850 ; GFX7-NEXT: s_setpc_b64 s[30:31]
33852 ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
33854 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33855 ; GFX8-NEXT: v_ffbh_u32_e32 v6, v5
33856 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
33857 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
33858 ; GFX8-NEXT: v_ffbh_u32_e32 v7, v1
33859 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
33860 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
33861 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
33862 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
33863 ; GFX8-NEXT: v_min_u32_e32 v7, 32, v7
33864 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
33865 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
33866 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
33867 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
33868 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
33869 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
33870 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
33871 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
33872 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33873 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
33874 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
33875 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
33876 ; GFX8-NEXT: v_ffbh_u32_e32 v5, v3
33877 ; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
33878 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
33879 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
33880 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
33881 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
33882 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
33883 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
33884 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
33885 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
33886 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
33887 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
33888 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
33889 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33890 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
33891 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
33892 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
33893 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
33894 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
33895 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
33896 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
33897 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
33898 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
33899 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
33900 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
33901 ; GFX8-NEXT: s_setpc_b64 s[30:31]
33903 ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
33905 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33906 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
33907 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
33908 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
33909 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
33910 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
33911 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
33912 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
33913 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
33914 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
33915 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
33916 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
33917 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33918 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33919 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33920 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
33921 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
33922 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
33923 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
33924 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
33925 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
33926 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
33927 ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
33928 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
33929 ; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
33930 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
33931 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
33932 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
33933 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
33934 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
33935 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
33936 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
33937 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
33938 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
33939 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
33940 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
33941 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
33942 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
33943 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
33944 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
33945 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
33946 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
33947 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
33948 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
33949 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33951 ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
33953 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33954 ; GFX10-NEXT: v_ffbh_u32_e32 v6, v1
33955 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v3
33956 ; GFX10-NEXT: v_ffbh_u32_e32 v7, v5
33957 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6
33958 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
33959 ; GFX10-NEXT: v_min_u32_e32 v7, 32, v7
33960 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
33961 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
33962 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
33963 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v7
33964 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
33965 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
33966 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
33967 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
33968 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
33969 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6
33970 ; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
33971 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8
33972 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
33973 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
33974 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
33975 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
33976 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
33977 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v7
33978 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
33979 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
33980 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
33981 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
33982 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
33983 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
33984 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
33985 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
33986 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
33987 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
33988 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
33989 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
33990 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
33991 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
33992 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
33993 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
33994 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
33995 ; GFX10-NEXT: s_setpc_b64 s[30:31]
33997 ; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
33998 ; GFX11TRUE16: ; %bb.0:
33999 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34000 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v6, v1
34001 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v7, v5
34002 ; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v3
34003 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34004 ; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 32, v6
34005 ; GFX11TRUE16-NEXT: v_min_u32_e32 v7, 32, v7
34006 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34007 ; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8
34008 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
34009 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34010 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
34011 ; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
34012 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
34013 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34014 ; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
34015 ; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
34016 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34017 ; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
34018 ; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
34019 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34020 ; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
34021 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
34022 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
34023 ; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
34024 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
34025 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
34026 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
34027 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34028 ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
34029 ; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v7
34030 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34031 ; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
34032 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
34033 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34034 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
34035 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
34036 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
34037 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34038 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
34039 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
34040 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
34041 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
34042 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
34043 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34044 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34045 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34046 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34047 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34048 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34049 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
34050 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34051 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
34052 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34054 ; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
34055 ; GFX11FAKE16: ; %bb.0:
34056 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34057 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v6, v1
34058 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v7, v5
34059 ; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v3
34060 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34061 ; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 32, v6
34062 ; GFX11FAKE16-NEXT: v_min_u32_e32 v7, 32, v7
34063 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34064 ; GFX11FAKE16-NEXT: v_min_u32_e32 v8, 32, v8
34065 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
34066 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34067 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
34068 ; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
34069 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
34070 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34071 ; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
34072 ; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
34073 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
34074 ; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
34075 ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
34076 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34077 ; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
34078 ; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
34079 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
34080 ; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
34081 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
34082 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
34083 ; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
34084 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34085 ; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
34086 ; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v7
34087 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34088 ; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
34089 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
34090 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
34091 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
34092 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
34093 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
34094 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34095 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
34096 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
34097 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
34098 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
34099 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
34100 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
34101 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34102 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
34103 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
34104 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34105 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
34106 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
34107 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
34108 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
34109 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34110 %op = uitofp <3 x i64> %x to <3 x bfloat>
34111 ret <3 x bfloat> %op
; v_uitofp_v4i64_to_v4bf16 - codegen test for one `uitofp <4 x i64>` to
; `<4 x bfloat>` conversion.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Check blocks present for this function, in order: GCN, GFX7, GFX8, GFX9,
; GFX10 and a single GFX11 block.
;
; What the expected code shows for each 64-bit lane, in every block:
;   ffbh/clz of the high dword, clamped to 32
;   -> 64-bit left shift by that count
;   -> low dword reduced with v_min_u32 against 1 and OR-ed with the high dword
;   -> v_cvt_f32_u32 -> v_ldexp_f32 by (32 - shift count).
; GCN and GFX7 then mask each f32 with 0xffff0000 and return one lane per
; VGPR (v0, v1, v2, v3).
; GFX8 and later run a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
; v_cndmask_b32 sequence per lane (presumably round-to-nearest-even with NaN
; quieting - TODO confirm against the bf16 lowering) and pack the four results
; into v0 and v1: GFX8 with two v_alignbit_b32, GFX9/GFX10/GFX11 with two
; v_perm_b32 using selector 0x7060302.
34114 define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
34115 ; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
34117 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34118 ; GCN-NEXT: v_ffbh_u32_e32 v8, v7
34119 ; GCN-NEXT: v_ffbh_u32_e32 v9, v5
34120 ; GCN-NEXT: v_ffbh_u32_e32 v10, v3
34121 ; GCN-NEXT: v_ffbh_u32_e32 v11, v1
34122 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8
34123 ; GCN-NEXT: v_min_u32_e32 v9, 32, v9
34124 ; GCN-NEXT: v_min_u32_e32 v10, 32, v10
34125 ; GCN-NEXT: v_min_u32_e32 v11, 32, v11
34126 ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
34127 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
34128 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
34129 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
34130 ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
34131 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
34132 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
34133 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
34134 ; GCN-NEXT: v_min_u32_e32 v6, 1, v6
34135 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4
34136 ; GCN-NEXT: v_min_u32_e32 v2, 1, v2
34137 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0
34138 ; GCN-NEXT: v_or_b32_e32 v6, v7, v6
34139 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4
34140 ; GCN-NEXT: v_or_b32_e32 v2, v3, v2
34141 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
34142 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6
34143 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4
34144 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
34145 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
34146 ; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
34147 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
34148 ; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
34149 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
34150 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34151 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34152 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
34153 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
34154 ; GCN-NEXT: s_setpc_b64 s[30:31]
34156 ; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
34158 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34159 ; GFX7-NEXT: v_ffbh_u32_e32 v8, v7
34160 ; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
34161 ; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
34162 ; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
34163 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
34164 ; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6
34165 ; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
34166 ; GFX7-NEXT: v_ffbh_u32_e32 v8, v5
34167 ; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
34168 ; GFX7-NEXT: v_ffbh_u32_e32 v7, v3
34169 ; GFX7-NEXT: v_min_u32_e32 v7, 32, v7
34170 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
34171 ; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
34172 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
34173 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
34174 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
34175 ; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
34176 ; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
34177 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
34178 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
34179 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
34180 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
34181 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
34182 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
34183 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
34184 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
34185 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
34186 ; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
34187 ; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
34188 ; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
34189 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
34190 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
34191 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34192 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34193 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
34194 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
34195 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34197 ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
34199 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34200 ; GFX8-NEXT: v_ffbh_u32_e32 v8, v5
34201 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
34202 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34203 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
34204 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
34205 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
34206 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
34207 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
34208 ; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
34209 ; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
34210 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
34211 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
34212 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
34213 ; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
34214 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
34215 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
34216 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
34217 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
34218 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
34219 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
34220 ; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
34221 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
34222 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
34223 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
34224 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
34225 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
34226 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
34227 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
34228 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
34229 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
34230 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
34231 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
34232 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
34233 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
34234 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
34235 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
34236 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
34237 ; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
34238 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
34239 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
34240 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
34241 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v0
34242 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
34243 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
34244 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
34245 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
34246 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
34247 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
34248 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
34249 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
34250 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
34251 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
34252 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
34253 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
34254 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
34255 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
34256 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
34257 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34258 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34259 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
34260 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
34261 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34263 ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
34265 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34266 ; GFX9-NEXT: v_ffbh_u32_e32 v8, v5
34267 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
34268 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34269 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
34270 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
34271 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
34272 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
34273 ; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
34274 ; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
34275 ; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
34276 ; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
34277 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
34278 ; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
34279 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
34280 ; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
34281 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
34282 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
34283 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
34284 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
34285 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
34286 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
34287 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
34288 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
34289 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
34290 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
34291 ; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
34292 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
34293 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
34294 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
34295 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
34296 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
34297 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
34298 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
34299 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
34300 ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
34301 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
34302 ; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
34303 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
34304 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
34305 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
34306 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
34307 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
34308 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
34309 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
34310 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
34311 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
34312 ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
34313 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
34314 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
34315 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
34316 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
34317 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
34318 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
34319 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
34320 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
34321 ; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
34322 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34324 ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
34326 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34327 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v5
34328 ; GFX10-NEXT: v_ffbh_u32_e32 v10, v1
34329 ; GFX10-NEXT: v_ffbh_u32_e32 v11, v3
34330 ; GFX10-NEXT: v_ffbh_u32_e32 v9, v7
34331 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
34332 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10
34333 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11
34334 ; GFX10-NEXT: v_min_u32_e32 v9, 32, v9
34335 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34336 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
34337 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
34338 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
34339 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8
34340 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 32, v9
34341 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
34342 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
34343 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
34344 ; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
34345 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
34346 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
34347 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
34348 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10
34349 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v11
34350 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v4
34351 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
34352 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
34353 ; GFX10-NEXT: v_or_b32_e32 v6, v7, v6
34354 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v8
34355 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
34356 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
34357 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6
34358 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
34359 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
34360 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
34361 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34362 ; GFX10-NEXT: v_ldexp_f32 v4, v4, v9
34363 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
34364 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
34365 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
34366 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
34367 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
34368 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34369 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34370 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
34371 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1
34372 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
34373 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
34374 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34375 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34376 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34377 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
34378 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
34379 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34380 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
34381 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34383 ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
34385 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34386 ; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5
34387 ; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1
34388 ; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3
34389 ; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7
34390 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34391 ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8
34392 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10
34393 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34394 ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
34395 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
34396 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34397 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
34398 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
34399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34400 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
34401 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
34402 ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8
34403 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
34404 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
34405 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
34406 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
34407 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
34408 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34409 ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
34410 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
34411 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
34412 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
34413 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10
34414 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11
34415 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4
34416 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
34417 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
34418 ; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
34419 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34420 ; GFX11-NEXT: v_ldexp_f32 v2, v2, v8
34421 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
34422 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
34423 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
34424 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6
34425 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
34426 ; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
34427 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
34428 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
34429 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
34430 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
34431 ; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
34432 ; GFX11-NEXT: v_ldexp_f32 v4, v4, v9
34433 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
34434 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
34435 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
34436 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
34437 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
34438 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
34439 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1
34440 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
34441 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
34442 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
34443 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
34444 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
34445 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
34446 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
34447 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
34448 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
34449 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
34450 ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
34451 ; GFX11-NEXT: s_setpc_b64 s[30:31]
34452 %op = uitofp <4 x i64> %x to <4 x bfloat>
34453 ret <4 x bfloat> %op
; v_select_bf16 - codegen test for `select i1 %cond, bfloat %a, bfloat %b`.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Check blocks present for this function, in order: GCN, GFX7, GFX8, GFX9,
; GFX10 and a single GFX11 block.
;
; Every block isolates bit 0 of v0 (v_and_b32 with 1), compares it against 1
; with v_cmp_eq_u32 and picks between v1 and v2 with one v_cndmask_b32.
; GCN and GFX7 additionally multiply v1 and v2 by 1.0 before the select and
; mask the selected value with 0xffff0000; GFX8 through GFX11 emit only the
; and/compare/select (plus one s_delay_alu on GFX11).
34456 define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
34457 ; GCN-LABEL: v_select_bf16:
34459 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34460 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34461 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34462 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34463 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34464 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34465 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34466 ; GCN-NEXT: s_setpc_b64 s[30:31]
34468 ; GFX7-LABEL: v_select_bf16:
34470 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34471 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34472 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34473 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34474 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34475 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34476 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34477 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34479 ; GFX8-LABEL: v_select_bf16:
34481 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34482 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34483 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34484 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34485 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34487 ; GFX9-LABEL: v_select_bf16:
34489 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34490 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34491 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34492 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34493 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34495 ; GFX10-LABEL: v_select_bf16:
34497 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34498 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34499 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34500 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34501 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34503 ; GFX11-LABEL: v_select_bf16:
34505 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34506 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
34507 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
34508 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34509 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34510 ; GFX11-NEXT: s_setpc_b64 s[30:31]
34511 %op = select i1 %cond, bfloat %a, bfloat %b
; Scalar bfloat select whose true-side operand is negated: %neg.a = fneg %a,
; then %op takes %neg.a when the i1 %cond is true, otherwise %b.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; The GCN and GFX7 groups show the negation as v_mul_f32 by -1.0 on v1; the
; GFX8 and later groups show an xor of v1 with 0x8000. GFX11 has separate
; groups for the two true16 settings (v_xor_b16 on v1.l versus v_xor_b32).
34515 define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34516 ; GCN-LABEL: v_select_fneg_lhs_bf16:
34518 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34519 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34520 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34521 ; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
34522 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34523 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34524 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34525 ; GCN-NEXT: s_setpc_b64 s[30:31]
34527 ; GFX7-LABEL: v_select_fneg_lhs_bf16:
34529 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34530 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34531 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34532 ; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
34533 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34534 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34535 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34536 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34538 ; GFX8-LABEL: v_select_fneg_lhs_bf16:
34540 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34541 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34542 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34543 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34544 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34545 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34547 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
34549 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34550 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34551 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34552 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34553 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34554 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34556 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
34558 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34559 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34560 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34561 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34562 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34563 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34565 ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
34566 ; GFX11TRUE16: ; %bb.0:
34567 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34568 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34569 ; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
34570 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34571 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34572 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34573 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34575 ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
34576 ; GFX11FAKE16: ; %bb.0:
34577 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34578 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34579 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34580 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34581 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34582 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34583 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34584 %neg.a = fneg bfloat %a
34585 %op = select i1 %cond, bfloat %neg.a, bfloat %b
; Scalar bfloat select whose false-side operand is negated: %neg.b = fneg %b,
; then %op takes %a when the i1 %cond is true, otherwise %neg.b.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; The GCN and GFX7 groups show the negation as v_mul_f32 by -1.0 on v2; the
; GFX8 and later groups show an xor of v2 with 0x8000. GFX11 has separate
; groups for the two true16 settings (v_xor_b16 on v2.l versus v_xor_b32).
34589 define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
34590 ; GCN-LABEL: v_select_fneg_rhs_bf16:
34592 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34593 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34594 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34595 ; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
34596 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34597 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34598 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34599 ; GCN-NEXT: s_setpc_b64 s[30:31]
34601 ; GFX7-LABEL: v_select_fneg_rhs_bf16:
34603 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34604 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34605 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34606 ; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
34607 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34608 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34609 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34610 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34612 ; GFX8-LABEL: v_select_fneg_rhs_bf16:
34614 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34615 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34616 ; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34617 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34618 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34619 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34621 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
34623 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34624 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34625 ; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34626 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34627 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34628 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34630 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
34632 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34633 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34634 ; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34635 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34636 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34637 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34639 ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
34640 ; GFX11TRUE16: ; %bb.0:
34641 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34642 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34643 ; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
34644 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34645 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34646 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34647 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34649 ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
34650 ; GFX11FAKE16: ; %bb.0:
34651 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34652 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34653 ; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34654 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34655 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34656 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34657 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34658 %neg.b = fneg bfloat %b
34659 %op = select i1 %cond, bfloat %a, bfloat %neg.b
; Whole-vector select on <2 x bfloat>: a single i1 %cond picks either all of %a
; or all of %b, and the chosen vector is returned.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Every group shows one condition compare feeding two v_cndmask selects. The
; GFX8 group recombines the two halves with v_or_b32_sdwa, and the GFX9 and
; later groups use v_perm_b32 with selector 0x5040100.
34663 define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34664 ; GCN-LABEL: v_select_v2bf16:
34666 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34667 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
34668 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
34669 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34670 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
34671 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34672 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34673 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34674 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34675 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34676 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34677 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
34678 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
34679 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
34680 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
34681 ; GCN-NEXT: s_setpc_b64 s[30:31]
34683 ; GFX7-LABEL: v_select_v2bf16:
34685 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34686 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
34687 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
34688 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34689 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
34690 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34691 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34692 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34693 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34694 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34695 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34696 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
34697 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
34698 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
34699 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
34700 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34702 ; GFX8-LABEL: v_select_v2bf16:
34704 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34705 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34706 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34707 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34708 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34709 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34710 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34711 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34712 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34713 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34715 ; GFX9-LABEL: v_select_v2bf16:
34717 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34718 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34719 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34720 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34721 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34722 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34723 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34724 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
34725 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
34726 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34728 ; GFX10-LABEL: v_select_v2bf16:
34730 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34731 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34732 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
34733 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34734 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34735 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34736 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
34737 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34738 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34740 ; GFX11TRUE16-LABEL: v_select_v2bf16:
34741 ; GFX11TRUE16: ; %bb.0:
34742 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34743 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34744 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
34745 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
34746 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
34747 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34748 ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
34749 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34750 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34751 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34753 ; GFX11FAKE16-LABEL: v_select_v2bf16:
34754 ; GFX11FAKE16: ; %bb.0:
34755 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34756 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
34757 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34758 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34759 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
34760 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34761 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
34762 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34763 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34764 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34765 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
34766 ret <2 x bfloat> %op
; Element-wise select on <2 x bfloat>: each lane of the <2 x i1> %cond picks
; the matching element of %a (true) or %b (false); the result vector is
; returned.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Unlike the single-condition vector select, every group here shows two
; separate v_cmp_eq_u32 compares, one for each condition register (v0 and v1).
34769 define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
34770 ; GCN-LABEL: v_vselect_v2bf16:
34772 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34773 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
34774 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
34775 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
34776 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
34777 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
34778 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
34779 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34780 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
34781 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34782 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
34783 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34784 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34785 ; GCN-NEXT: s_setpc_b64 s[30:31]
34787 ; GFX7-LABEL: v_vselect_v2bf16:
34789 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34790 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
34791 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
34792 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
34793 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
34794 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34795 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
34796 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
34797 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
34798 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34799 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
34800 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
34801 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
34802 ; GFX7-NEXT: s_setpc_b64 s[30:31]
34804 ; GFX8-LABEL: v_vselect_v2bf16:
34806 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34807 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
34808 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
34809 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34810 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
34811 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34812 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34813 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34814 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
34815 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34816 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34817 ; GFX8-NEXT: s_setpc_b64 s[30:31]
34819 ; GFX9-LABEL: v_vselect_v2bf16:
34821 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34822 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
34823 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
34824 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
34825 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
34826 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34827 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34828 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
34829 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
34830 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
34831 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
34832 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34834 ; GFX10-LABEL: v_vselect_v2bf16:
34836 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34837 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
34838 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
34839 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34840 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
34841 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34842 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
34843 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34844 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34845 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34846 ; GFX10-NEXT: s_setpc_b64 s[30:31]
34848 ; GFX11TRUE16-LABEL: v_vselect_v2bf16:
34849 ; GFX11TRUE16: ; %bb.0:
34850 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34851 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
34852 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
34853 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34854 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
34855 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34856 ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
34857 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34858 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
34859 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
34860 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34861 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34863 ; GFX11FAKE16-LABEL: v_vselect_v2bf16:
34864 ; GFX11FAKE16: ; %bb.0:
34865 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34866 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34867 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
34868 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
34869 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
34870 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34871 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
34872 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
34873 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
34874 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
34875 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
34876 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
34877 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
34878 ret <2 x bfloat> %op
; amdgpu_ps variant of the scalar bfloat select, with both bfloat operands
; marked inreg. The condition is computed in the function as %c == 0; the
; selected bfloat is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Every group ends with v_readfirstlane_b32 into s0; the inreg operands appear
; as s0 and s1.
34881 define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
34882 ; GCN-LABEL: s_select_bf16:
34884 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
34885 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
34886 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34887 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34888 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34889 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
34890 ; GCN-NEXT: ; return to shader part epilog
34892 ; GFX7-LABEL: s_select_bf16:
34894 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
34895 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
34896 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34897 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
34898 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
34899 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
34900 ; GFX7-NEXT: ; return to shader part epilog
34902 ; GFX8-LABEL: s_select_bf16:
34904 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
34905 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
34906 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34907 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
34908 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
34909 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
34910 ; GFX8-NEXT: ; return to shader part epilog
34912 ; GFX9-LABEL: s_select_bf16:
34914 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
34915 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
34916 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34917 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
34918 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
34919 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
34920 ; GFX9-NEXT: ; return to shader part epilog
34922 ; GFX10-LABEL: s_select_bf16:
34924 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
34925 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
34926 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34927 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
34928 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
34929 ; GFX10-NEXT: ; return to shader part epilog
34931 ; GFX11-LABEL: s_select_bf16:
34933 ; GFX11-NEXT: v_mov_b32_e32 v1, s0
34934 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
34935 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
34936 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
34937 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
34938 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
34939 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
34940 ; GFX11-NEXT: ; return to shader part epilog
34941 %cond = icmp eq i32 %c, 0
34942 %op = select i1 %cond, bfloat %a, bfloat %b
34943 %cast = bitcast bfloat %op to i16
34944 %zext = zext i16 %cast to i32
34945 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
; amdgpu_ps variant of the whole-vector <2 x bfloat> select, with both vector
; operands marked inreg. A single condition, %c == 0, picks all of %a or all of
; %b; the chosen vector is bitcast to i32 and passed through
; llvm.amdgcn.readfirstlane.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Every group shows one v_cmp_eq_u32 against 0 feeding two v_cndmask selects
; and ends with v_readfirstlane_b32 into s0.
34949 define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
34950 ; GCN-LABEL: s_select_v2bf16:
34952 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
34953 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3
34954 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0
34955 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2
34956 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34957 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34958 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34959 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34960 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34961 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
34962 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34963 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34964 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
34965 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
34966 ; GCN-NEXT: ; return to shader part epilog
34968 ; GFX7-LABEL: s_select_v2bf16:
34970 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
34971 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3
34972 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
34973 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
34974 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0
34975 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
34976 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34977 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
34978 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
34979 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
34980 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
34981 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
34982 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
34983 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
34984 ; GFX7-NEXT: ; return to shader part epilog
34986 ; GFX8-LABEL: s_select_v2bf16:
34988 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
34989 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
34990 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
34991 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
34992 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
34993 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
34994 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
34995 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
34996 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
34997 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
34998 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
34999 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
35000 ; GFX8-NEXT: ; return to shader part epilog
35002 ; GFX9-LABEL: s_select_v2bf16:
35004 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
35005 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
35006 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
35007 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
35008 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35009 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
35010 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
35011 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
35012 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
35013 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
35014 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0
35015 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
35016 ; GFX9-NEXT: ; return to shader part epilog
35018 ; GFX10-LABEL: s_select_v2bf16:
35020 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
35021 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
35022 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
35023 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35024 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
35025 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35026 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35027 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35028 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
35029 ; GFX10-NEXT: ; return to shader part epilog
35031 ; GFX11TRUE16-LABEL: s_select_v2bf16:
35032 ; GFX11TRUE16: ; %bb.0:
35033 ; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16
35034 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
35035 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
35036 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3
35037 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1
35038 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
35039 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35040 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
35041 ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
35042 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35043 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35044 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35045 ; GFX11TRUE16-NEXT: ; return to shader part epilog
35047 ; GFX11FAKE16-LABEL: s_select_v2bf16:
35048 ; GFX11FAKE16: ; %bb.0:
35049 ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
35050 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35051 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
35052 ; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16
35053 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35054 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
35055 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
35056 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
35057 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35058 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35059 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
35060 ; GFX11FAKE16-NEXT: ; return to shader part epilog
35061 %cond = icmp eq i32 %c, 0
35062 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
35063 %cast = bitcast <2 x bfloat> %op to i32
35064 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
; amdgpu_ps variant of the element-wise <2 x bfloat> select, with both vector
; operands marked inreg. Each lane's condition is the matching element of
; <2 x i32> %c compared equal to zero; the selected vector is bitcast to i32
; and passed through llvm.amdgcn.readfirstlane.
; The expectations that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Every group shows two v_cmp_eq_u32 compares against 0 (on v1 and on v0),
; each feeding its own v_cndmask, and ends with v_readfirstlane_b32 into s0.
35068 define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
35069 ; GCN-LABEL: s_vselect_v2bf16:
35071 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
35072 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2
35073 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1
35074 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
35075 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35076 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
35077 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35078 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
35079 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35080 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35081 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
35082 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
35083 ; GCN-NEXT: ; return to shader part epilog
35085 ; GFX7-LABEL: s_vselect_v2bf16:
35087 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1
35088 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3
35089 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35090 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
35091 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
35092 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
35093 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35094 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
35095 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35096 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35097 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
35098 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
35099 ; GFX7-NEXT: ; return to shader part epilog
35101 ; GFX8-LABEL: s_vselect_v2bf16:
35103 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
35104 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
35105 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
35106 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
35107 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35108 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
35109 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
35110 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
35111 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35112 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35113 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
35114 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
35115 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
35116 ; GFX8-NEXT: ; return to shader part epilog
35118 ; GFX9-LABEL: s_vselect_v2bf16:
35120 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
35121 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
35122 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
35123 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
35124 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
35125 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
35126 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
35127 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
35128 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
35129 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
35130 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
35131 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
35132 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
35133 ; GFX9-NEXT: ; return to shader part epilog
35135 ; GFX10-LABEL: s_vselect_v2bf16:
35137 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
35138 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
35139 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
35140 ; GFX10-NEXT: v_mov_b32_e32 v3, s0
35141 ; GFX10-NEXT: s_lshr_b32 s0, s1, 16
35142 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35143 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35144 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35145 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35146 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
35147 ; GFX10-NEXT: ; return to shader part epilog
35149 ; GFX11TRUE16-LABEL: s_vselect_v2bf16:
35150 ; GFX11TRUE16: ; %bb.0:
35151 ; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16
35152 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
35153 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
35154 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s3
35155 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
35156 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
35157 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s0
35158 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
35159 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
35160 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35161 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
35162 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35163 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35164 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35165 ; GFX11TRUE16-NEXT: ; return to shader part epilog
35167 ; GFX11FAKE16-LABEL: s_vselect_v2bf16:
35168 ; GFX11FAKE16: ; %bb.0:
35169 ; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
35170 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
35171 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
35172 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16
35173 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
35174 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
35175 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
35176 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
35177 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
35178 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
35179 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35180 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
35181 ; GFX11FAKE16-NEXT: ; return to shader part epilog
35182 %cond = icmp eq <2 x i32> %c, zeroinitializer
35183 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
35184 %cast = bitcast <2 x bfloat> %op to i32
35185 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
; v_select_v3bf16 -- returns %a when the scalar i1 %cond is true and %b
; otherwise, for <3 x bfloat> operands.
; Expected code per the check lines below: for GFX8 and newer the condition is
; masked to bit 0, compared against 1, and two v_cndmask_b32 pick the result
; dwords (the GFX11 runs expect them fused into one v_dual_cndmask_b32). The
; GCN and GFX7 runs additionally expect v_mul_f32 by 1.0, 16-bit shifts and
; v_alignbit_b32 around the selects.
; NOTE(review): the extra instructions on the two oldest targets presumably
; come from each bf16 element arriving in its own VGPR there -- confirm against
; the lowering before relying on this.
35189 define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
35190 ; GCN-LABEL: v_select_v3bf16:
35192 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35193 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35194 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35195 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35196 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35197 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35198 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35199 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35200 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35201 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35202 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35203 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
35204 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35205 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
35206 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35207 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35208 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
35209 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
35210 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35211 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35212 ; GCN-NEXT: s_setpc_b64 s[30:31]
35214 ; GFX7-LABEL: v_select_v3bf16:
35216 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35217 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35218 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35219 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35220 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35221 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
35222 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35223 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35224 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35225 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35226 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35227 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35228 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35229 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
35230 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35231 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35232 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
35233 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
35234 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35235 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35236 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35238 ; GFX8-LABEL: v_select_v3bf16:
35240 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35241 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35242 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35243 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35244 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35245 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35247 ; GFX9-LABEL: v_select_v3bf16:
35249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35250 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35251 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35252 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35253 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35254 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35256 ; GFX10-LABEL: v_select_v3bf16:
35258 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35259 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35260 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35261 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35262 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35263 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35265 ; GFX11-LABEL: v_select_v3bf16:
35267 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35268 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35269 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35270 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35271 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35272 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test -- one whole-vector select on the scalar condition.
35273 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
35274 ret <3 x bfloat> %op
; v_select_v4bf16 -- returns %a when the scalar i1 %cond is true and %b
; otherwise, for <4 x bfloat> operands.
; Expected code per the check lines below: for GFX8 and newer the condition is
; masked to bit 0, compared against 1, and two v_cndmask_b32 pick the result
; dwords (the GFX11 runs expect them fused into one v_dual_cndmask_b32). The
; GCN and GFX7 runs additionally expect v_mul_f32 by 1.0, 16-bit shifts and
; v_alignbit_b32 before the selects, then a shift/mask pair per selected dword
; afterwards.
; NOTE(review): the extra instructions on the two oldest targets presumably
; come from each bf16 element arriving in its own VGPR there -- confirm against
; the lowering before relying on this.
35277 define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
35278 ; GCN-LABEL: v_select_v4bf16:
35280 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35281 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35282 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35283 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35284 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35285 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35286 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35287 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35288 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35289 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35290 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35291 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35292 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35293 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35294 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35295 ; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16
35296 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35297 ; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16
35298 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35299 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35300 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35301 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35302 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35303 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35304 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35305 ; GCN-NEXT: s_setpc_b64 s[30:31]
35307 ; GFX7-LABEL: v_select_v4bf16:
35309 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35310 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35311 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35312 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35313 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35314 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35315 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6
35316 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35317 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35318 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35319 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35320 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35321 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
35322 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
35323 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35324 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
35325 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35326 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
35327 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35328 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35329 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35330 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35331 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35332 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35333 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35334 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35336 ; GFX8-LABEL: v_select_v4bf16:
35338 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35339 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35340 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35341 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35342 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35343 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35345 ; GFX9-LABEL: v_select_v4bf16:
35347 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35348 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35349 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35350 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
35351 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
35352 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35354 ; GFX10-LABEL: v_select_v4bf16:
35356 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35357 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35358 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35359 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
35360 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
35361 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35363 ; GFX11-LABEL: v_select_v4bf16:
35365 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35366 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35367 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35368 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35369 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
35370 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test -- one whole-vector select on the scalar condition.
35371 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
35372 ret <4 x bfloat> %op
; v_select_v6bf16 -- returns %a when the scalar i1 %cond is true and %b
; otherwise, for <6 x bfloat> operands.
; Expected code per the check lines below: for GFX8 and newer the condition is
; masked to bit 0, compared against 1, and three v_cndmask_b32 pick the result
; dwords (the GFX11 runs expect one v_dual_cndmask_b32 pair plus one plain
; v_cndmask_b32). The GCN and GFX7 runs additionally expect v_mul_f32 by 1.0,
; 16-bit shifts and v_alignbit_b32 before the selects, then a shift/mask pair
; per selected dword afterwards.
; NOTE(review): the extra instructions on the two oldest targets presumably
; come from each bf16 element arriving in its own VGPR there -- confirm against
; the lowering before relying on this.
35375 define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
35376 ; GCN-LABEL: v_select_v6bf16:
35378 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35379 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35380 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35381 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35382 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35383 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35384 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35385 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
35386 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35387 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35388 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35389 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35390 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35391 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35392 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35393 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35394 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35395 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35396 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35397 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35398 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35399 ; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16
35400 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35401 ; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16
35402 ; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
35403 ; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16
35404 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35405 ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35406 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35407 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35408 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35409 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35410 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35411 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35412 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35413 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35414 ; GCN-NEXT: s_setpc_b64 s[30:31]
35416 ; GFX7-LABEL: v_select_v6bf16:
35418 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35419 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35420 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35421 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35422 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35423 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35424 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
35425 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35426 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35427 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35428 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35429 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35430 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35431 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10
35432 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35433 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35434 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
35435 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35436 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9
35437 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35438 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12
35439 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
35440 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35441 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
35442 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35443 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
35444 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35445 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35446 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35447 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35448 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35449 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35450 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35451 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35452 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35453 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35454 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35456 ; GFX8-LABEL: v_select_v6bf16:
35458 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35459 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35460 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35461 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
35462 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
35463 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
35464 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35466 ; GFX9-LABEL: v_select_v6bf16:
35468 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35469 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35470 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35471 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
35472 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
35473 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
35474 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35476 ; GFX10-LABEL: v_select_v6bf16:
35478 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35479 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35480 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35481 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
35482 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
35483 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35484 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35486 ; GFX11-LABEL: v_select_v6bf16:
35488 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35489 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35490 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35491 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35492 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
35493 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
35494 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test -- one whole-vector select on the scalar condition.
35495 %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
35496 ret <6 x bfloat> %op
; v_select_v8bf16 -- returns %a when the scalar i1 %cond is true and %b
; otherwise, for <8 x bfloat> operands.
; Expected code per the check lines below: for GFX8 and newer the condition is
; masked to bit 0, compared against 1, and four v_cndmask_b32 pick the result
; dwords (the GFX11 runs expect them fused into two v_dual_cndmask_b32 pairs).
; The GCN and GFX7 runs additionally expect v_mul_f32 by 1.0, 16-bit shifts and
; v_alignbit_b32 before the selects, then a shift/mask pair per selected dword
; afterwards.
; NOTE(review): the extra instructions on the two oldest targets presumably
; come from each bf16 element arriving in its own VGPR there -- confirm against
; the lowering before relying on this.
35499 define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
35500 ; GCN-LABEL: v_select_v8bf16:
35502 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35503 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
35504 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35505 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
35506 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35507 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
35508 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35509 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35510 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35511 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
35512 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35513 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
35514 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
35515 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
35516 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35517 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
35518 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
35519 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35520 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35521 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35522 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35523 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35524 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35525 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35526 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35527 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35528 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
35529 ; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16
35530 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
35531 ; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16
35532 ; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
35533 ; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16
35534 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
35535 ; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16
35536 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35537 ; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35538 ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35539 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35540 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35541 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35542 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35543 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35544 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35545 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35546 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35547 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35548 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35549 ; GCN-NEXT: s_setpc_b64 s[30:31]
35551 ; GFX7-LABEL: v_select_v8bf16:
35553 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35554 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35555 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35556 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35557 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35558 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35559 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
35560 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35561 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35562 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35563 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35564 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
35565 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35566 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12
35567 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35568 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35569 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
35570 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16
35571 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35572 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11
35573 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35574 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14
35575 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35576 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35577 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16
35578 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35579 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13
35580 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
35581 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16
35582 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16
35583 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35584 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15
35585 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35586 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
35587 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35588 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35589 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35590 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35591 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35592 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35593 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35594 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35595 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35596 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35597 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35598 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35599 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35600 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35602 ; GFX8-LABEL: v_select_v8bf16:
35604 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35605 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35606 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35607 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
35608 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
35609 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
35610 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
35611 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35613 ; GFX9-LABEL: v_select_v8bf16:
35615 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35616 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35617 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35618 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
35619 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
35620 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
35621 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
35622 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35624 ; GFX10-LABEL: v_select_v8bf16:
35626 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35627 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35628 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35629 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
35630 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
35631 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo
35632 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo
35633 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35635 ; GFX11-LABEL: v_select_v8bf16:
35637 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35638 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35639 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35640 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35641 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
35642 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
35643 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test -- one whole-vector select on the scalar condition.
35644 %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
35645 ret <8 x bfloat> %op
; v_select_v16bf16 -- returns %a when the scalar i1 %cond is true and %b
; otherwise, for <16 x bfloat> operands.
; Expected code per the check lines below: for GFX8 and newer the condition is
; masked to bit 0, compared against 1, and eight v_cndmask_b32 pick the result
; dwords (the GFX11 runs expect them fused into four v_dual_cndmask_b32 pairs).
; The GCN and GFX7 runs additionally expect v_mul_f32 by 1.0, 16-bit shifts and
; v_alignbit_b32 before the selects, a shift/mask pair per selected dword
; afterwards, and two buffer_load_dword from s32 and s32 offset:4 that are
; waited on with s_waitcnt vmcnt before their values are used.
; NOTE(review): those two loads presumably fetch the trailing %b elements that
; did not fit in argument VGPRs on the two oldest targets -- confirm against
; the calling-convention lowering before relying on this.
35648 define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
35649 ; GCN-LABEL: v_select_v16bf16:
35651 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35652 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35653 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35654 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
35655 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35656 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35657 ; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
35658 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
35659 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17
35660 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
35661 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
35662 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
35663 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
35664 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35665 ; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
35666 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
35667 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
35668 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35669 ; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
35670 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
35671 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
35672 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35673 ; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
35674 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
35675 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
35676 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
35677 ; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
35678 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
35679 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
35680 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35681 ; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
35682 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24
35683 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23
35684 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
35685 ; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
35686 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
35687 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
35688 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35689 ; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
35690 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
35691 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25
35692 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
35693 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
35694 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
35695 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27
35696 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
35697 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
35698 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30
35699 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
35700 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
35701 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
35702 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
35703 ; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
35704 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
35705 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35706 ; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
35707 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32
35708 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
35709 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35710 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
35711 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35712 ; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16
35713 ; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
35714 ; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16
35715 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
35716 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
35717 ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
35718 ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
35719 ; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
35720 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
35721 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
35722 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
35723 ; GCN-NEXT: s_waitcnt vmcnt(1)
35724 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10
35725 ; GCN-NEXT: s_waitcnt vmcnt(0)
35726 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
35727 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35728 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35729 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35730 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35731 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35732 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35733 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35734 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35735 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
35736 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
35737 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
35738 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
35739 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
35740 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
35741 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35742 ; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
35743 ; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
35744 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
35745 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
35746 ; GCN-NEXT: s_setpc_b64 s[30:31]
35748 ; GFX7-LABEL: v_select_v16bf16:
35750 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35751 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
35752 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35753 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
35754 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
35755 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
35756 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
35757 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35758 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
35759 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
35760 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35761 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
35762 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
35763 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
35764 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35765 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
35766 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
35767 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
35768 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35769 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
35770 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
35771 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
35772 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35773 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
35774 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
35775 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
35776 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35777 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21
35778 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
35779 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
35780 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35781 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
35782 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16
35783 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35784 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23
35785 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
35786 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
35787 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16
35788 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35789 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25
35790 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
35791 ; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16
35792 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35793 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
35794 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
35795 ; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
35796 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
35797 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
35798 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
35799 ; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16
35800 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32
35801 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
35802 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35803 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
35804 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
35805 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
35806 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
35807 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35808 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
35809 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35810 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
35811 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
35812 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
35813 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
35814 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35815 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
35816 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
35817 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
35818 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
35819 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
35820 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
35821 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
35822 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
35823 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35824 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
35825 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
35826 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
35827 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
35828 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
35829 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
35830 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
35831 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
35832 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
35833 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
35834 ; GFX7-NEXT: s_waitcnt vmcnt(1)
35835 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
35836 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35837 ; GFX7-NEXT: s_waitcnt vmcnt(0)
35838 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
35839 ; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
35840 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
35841 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
35842 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
35843 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
35844 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
35845 ; GFX7-NEXT: s_setpc_b64 s[30:31]
35847 ; GFX8-LABEL: v_select_v16bf16:
35849 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35850 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
35851 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35852 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
35853 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
35854 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
35855 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
35856 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
35857 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
35858 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
35859 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
35860 ; GFX8-NEXT: s_setpc_b64 s[30:31]
35862 ; GFX9-LABEL: v_select_v16bf16:
35864 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35865 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
35866 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35867 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
35868 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
35869 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
35870 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
35871 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
35872 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
35873 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
35874 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
35875 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35877 ; GFX10-LABEL: v_select_v16bf16:
35879 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35880 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
35881 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35882 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo
35883 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo
35884 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo
35885 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo
35886 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo
35887 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo
35888 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo
35889 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo
35890 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35892 ; GFX11-LABEL: v_select_v16bf16:
35894 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35895 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
35896 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35897 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
35898 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
35899 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
35900 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
35901 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
35902 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test -- one whole-vector select on the scalar condition.
35903 %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
35904 ret <16 x bfloat> %op
; Select between two <32 x bfloat> vector arguments on a scalar i1 condition
; and return the chosen vector.
;
; The check lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Shape of the expected output, as recorded by the checks:
;  * Every prefix masks the condition with v_and_b32 against 1 and compares
;    it with 1 (v_cmp_eq_u32) to form the select mask.
;  * The GCN and GFX7 prefixes expect the operands to be combined two at a
;    time (v_mul_f32 by 1.0, v_lshrrev_b32 by 16, v_alignbit_b32), selected
;    with v_cndmask_b32, and then split again with v_lshlrev_b32 by 16 and
;    v_and_b32 with 0xffff0000. Operands that are not already in v1-v30 are
;    read with buffer_load_dword relative to s32.
;  * The GFX8, GFX9, GFX10 and GFX11 prefixes expect sixteen 32-bit
;    conditional moves (issued as v_dual_cndmask_b32 pairs on GFX11) over
;    v1-v30 plus two dwords loaded at s32 and s32 offset:4.
;    NOTE(review): presumably each 32-bit register carries two packed bfloat
;    lanes on these targets -- confirm against the calling convention.
35907 define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
35908 ; GCN-LABEL: v_select_v32bf16:
35910 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35911 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
35912 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
35913 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
35914 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
35915 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
35916 ; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
35917 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
35918 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
35919 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
35920 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
35921 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6
35922 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
35923 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
35924 ; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
35925 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8
35926 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
35927 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
35928 ; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
35929 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
35930 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9
35931 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
35932 ; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
35933 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12
35934 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
35935 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
35936 ; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
35937 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14
35938 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13
35939 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
35940 ; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
35941 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16
35942 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15
35943 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
35944 ; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
35945 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
35946 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17
35947 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
35948 ; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
35949 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20
35950 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19
35951 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
35952 ; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
35953 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12
35954 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22
35955 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21
35956 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
35957 ; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
35958 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
35959 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24
35960 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23
35961 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
35962 ; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
35963 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
35964 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26
35965 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25
35966 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
35967 ; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
35968 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
35969 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28
35970 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27
35971 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
35972 ; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
35973 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
35974 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30
35975 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
35976 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
35977 ; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16
35978 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24
35979 ; GCN-NEXT: s_waitcnt vmcnt(5)
35980 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
35981 ; GCN-NEXT: s_waitcnt vmcnt(4)
35982 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
35983 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
35984 ; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
35985 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
35986 ; GCN-NEXT: s_waitcnt vmcnt(4)
35987 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
35988 ; GCN-NEXT: s_waitcnt vmcnt(3)
35989 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
35990 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
35991 ; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
35992 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
35993 ; GCN-NEXT: s_waitcnt vmcnt(3)
35994 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
35995 ; GCN-NEXT: s_waitcnt vmcnt(2)
35996 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
35997 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
35998 ; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
35999 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
36000 ; GCN-NEXT: s_waitcnt vmcnt(2)
36001 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36002 ; GCN-NEXT: s_waitcnt vmcnt(1)
36003 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
36004 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40
36005 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36006 ; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16
36007 ; GCN-NEXT: s_waitcnt vmcnt(1)
36008 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
36009 ; GCN-NEXT: s_waitcnt vmcnt(0)
36010 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36011 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
36012 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
36013 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
36014 ; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
36015 ; GCN-NEXT: s_waitcnt vmcnt(1)
36016 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
36017 ; GCN-NEXT: s_waitcnt vmcnt(0)
36018 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
36019 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
36020 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56
36021 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36022 ; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
36023 ; GCN-NEXT: s_waitcnt vmcnt(1)
36024 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
36025 ; GCN-NEXT: s_waitcnt vmcnt(0)
36026 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
36027 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68
36028 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
36029 ; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
36030 ; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
36031 ; GCN-NEXT: s_waitcnt vmcnt(1)
36032 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
36033 ; GCN-NEXT: s_waitcnt vmcnt(0)
36034 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
36035 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76
36036 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
36037 ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36038 ; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
36039 ; GCN-NEXT: s_waitcnt vmcnt(1)
36040 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
36041 ; GCN-NEXT: s_waitcnt vmcnt(0)
36042 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
36043 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84
36044 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
36045 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
36046 ; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
36047 ; GCN-NEXT: s_waitcnt vmcnt(1)
36048 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
36049 ; GCN-NEXT: s_waitcnt vmcnt(0)
36050 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
36051 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
36052 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
36053 ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36054 ; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
36055 ; GCN-NEXT: s_waitcnt vmcnt(1)
36056 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
36057 ; GCN-NEXT: s_waitcnt vmcnt(0)
36058 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
36059 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100
36060 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
36061 ; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
36062 ; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
36063 ; GCN-NEXT: s_waitcnt vmcnt(1)
36064 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
36065 ; GCN-NEXT: s_waitcnt vmcnt(0)
36066 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
36067 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108
36068 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
36069 ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36070 ; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
36071 ; GCN-NEXT: s_waitcnt vmcnt(1)
36072 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
36073 ; GCN-NEXT: s_waitcnt vmcnt(0)
36074 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
36075 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
36076 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
36077 ; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
36078 ; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
36079 ; GCN-NEXT: s_waitcnt vmcnt(1)
36080 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
36081 ; GCN-NEXT: s_waitcnt vmcnt(0)
36082 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
36083 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
36084 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
36085 ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36086 ; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
36087 ; GCN-NEXT: s_waitcnt vmcnt(1)
36088 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
36089 ; GCN-NEXT: s_waitcnt vmcnt(0)
36090 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
36091 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
36092 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
36093 ; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
36094 ; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16
36095 ; GCN-NEXT: s_waitcnt vmcnt(1)
36096 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
36097 ; GCN-NEXT: s_waitcnt vmcnt(0)
36098 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
36099 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
36100 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
36101 ; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36102 ; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
36103 ; GCN-NEXT: s_waitcnt vmcnt(1)
36104 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
36105 ; GCN-NEXT: s_waitcnt vmcnt(0)
36106 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
36107 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
36108 ; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
36109 ; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
36110 ; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
36111 ; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
36112 ; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc
36113 ; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc
36114 ; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc
36115 ; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc
36116 ; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc
36117 ; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc
36118 ; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc
36119 ; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc
36120 ; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc
36121 ; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc
36122 ; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc
36123 ; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
36124 ; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc
36125 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
36126 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36127 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
36128 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36129 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
36130 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
36131 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
36132 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
36133 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
36134 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
36135 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
36136 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
36137 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
36138 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
36139 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
36140 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
36141 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23
36142 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
36143 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24
36144 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24
36145 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25
36146 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25
36147 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
36148 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
36149 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27
36150 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
36151 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28
36152 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
36153 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29
36154 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
36155 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
36156 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
36157 ; GCN-NEXT: s_setpc_b64 s[30:31]
36159 ; GFX7-LABEL: v_select_v32bf16:
36161 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36162 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
36163 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36164 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
36165 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
36166 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4
36167 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36168 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
36169 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36170 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
36171 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36172 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
36173 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
36174 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
36175 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36176 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
36177 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
36178 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
36179 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36180 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
36181 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
36182 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
36183 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
36184 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
36185 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
36186 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
36187 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
36188 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
36189 ; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
36190 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
36191 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
36192 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
36193 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
36194 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
36195 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
36196 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36197 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
36198 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
36199 ; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16
36200 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
36201 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
36202 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
36203 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
36204 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36205 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
36206 ; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
36207 ; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16
36208 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
36209 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
36210 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
36211 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
36212 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
36213 ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36214 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
36215 ; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16
36216 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
36217 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36218 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
36219 ; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
36220 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
36221 ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36222 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
36223 ; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
36224 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
36225 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36226 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
36227 ; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
36228 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
36229 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36230 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
36231 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
36232 ; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
36233 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
36234 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
36235 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
36236 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
36237 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
36238 ; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
36239 ; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
36240 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
36241 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
36242 ; GFX7-NEXT: s_waitcnt vmcnt(14)
36243 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
36244 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
36245 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
36246 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
36247 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36248 ; GFX7-NEXT: s_waitcnt vmcnt(13)
36249 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
36250 ; GFX7-NEXT: s_waitcnt vmcnt(12)
36251 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
36252 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
36253 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
36254 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
36255 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
36256 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
36257 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36258 ; GFX7-NEXT: s_waitcnt vmcnt(12)
36259 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
36260 ; GFX7-NEXT: s_waitcnt vmcnt(11)
36261 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
36262 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
36263 ; GFX7-NEXT: s_waitcnt vmcnt(9)
36264 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
36265 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
36266 ; GFX7-NEXT: s_waitcnt vmcnt(7)
36267 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
36268 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
36269 ; GFX7-NEXT: s_waitcnt vmcnt(6)
36270 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
36271 ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
36272 ; GFX7-NEXT: s_waitcnt vmcnt(5)
36273 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
36274 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
36275 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
36276 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
36277 ; GFX7-NEXT: s_waitcnt vmcnt(4)
36278 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
36279 ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
36280 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
36281 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
36282 ; GFX7-NEXT: s_waitcnt vmcnt(3)
36283 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
36284 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
36285 ; GFX7-NEXT: s_waitcnt vmcnt(1)
36286 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
36287 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
36288 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36289 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
36290 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
36291 ; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
36292 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
36293 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36294 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
36295 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
36296 ; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
36297 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
36298 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36299 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
36300 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
36301 ; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
36302 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
36303 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc
36304 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9
36305 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36306 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
36307 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
36308 ; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16
36309 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
36310 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc
36311 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
36312 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
36313 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
36314 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36315 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
36316 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
36317 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
36318 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
36319 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
36320 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36321 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36322 ; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
36323 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
36324 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
36325 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
36326 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
36327 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36328 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36329 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
36330 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
36331 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
36332 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
36333 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
36334 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36335 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36336 ; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
36337 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
36338 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
36339 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
36340 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
36341 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36342 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36343 ; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
36344 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
36345 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
36346 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
36347 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
36348 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36349 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36350 ; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
36351 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
36352 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
36353 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
36354 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
36355 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36356 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36357 ; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
36358 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
36359 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
36360 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
36361 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
36362 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36363 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36364 ; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
36365 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
36366 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
36367 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
36368 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
36369 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36370 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36371 ; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
36372 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
36373 ; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
36374 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
36375 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
36376 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36377 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36378 ; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
36379 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
36380 ; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
36381 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
36382 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
36383 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36384 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36385 ; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
36386 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
36387 ; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
36388 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
36389 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
36390 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36391 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
36392 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
36393 ; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
36394 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
36395 ; GFX7-NEXT: s_waitcnt vmcnt(0)
36396 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
36397 ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
36398 ; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
36399 ; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
36400 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
36401 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
36402 ; GFX7-NEXT: s_setpc_b64 s[30:31]
36404 ; GFX8-LABEL: v_select_v32bf16:
36406 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36407 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
36408 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36409 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
36410 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
36411 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
36412 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
36413 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
36414 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
36415 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
36416 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
36417 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
36418 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
36419 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
36420 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
36421 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
36422 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
36423 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
36424 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
36425 ; GFX8-NEXT: s_waitcnt vmcnt(1)
36426 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
36427 ; GFX8-NEXT: s_waitcnt vmcnt(0)
36428 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
36429 ; GFX8-NEXT: s_setpc_b64 s[30:31]
36431 ; GFX9-LABEL: v_select_v32bf16:
36433 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36434 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
36435 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36436 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
36437 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
36438 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32
36439 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
36440 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
36441 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
36442 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
36443 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
36444 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
36445 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
36446 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
36447 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
36448 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
36449 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
36450 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
36451 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
36452 ; GFX9-NEXT: s_waitcnt vmcnt(1)
36453 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
36454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
36455 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
36456 ; GFX9-NEXT: s_setpc_b64 s[30:31]
36458 ; GFX10-LABEL: v_select_v32bf16:
36460 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36461 ; GFX10-NEXT: s_clause 0x1
36462 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
36463 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
36464 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
36465 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
36466 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo
36467 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo
36468 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo
36469 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo
36470 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo
36471 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo
36472 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo
36473 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo
36474 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo
36475 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo
36476 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo
36477 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo
36478 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo
36479 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo
36480 ; GFX10-NEXT: s_waitcnt vmcnt(1)
36481 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo
36482 ; GFX10-NEXT: s_waitcnt vmcnt(0)
36483 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo
36484 ; GFX10-NEXT: s_setpc_b64 s[30:31]
36486 ; GFX11-LABEL: v_select_v32bf16:
36488 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36489 ; GFX11-NEXT: s_clause 0x1
36490 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
36491 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
36492 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
36493 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
36494 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
36495 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
36496 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
36497 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
36498 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
36499 ; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
36500 ; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
36501 ; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
36502 ; GFX11-NEXT: s_waitcnt vmcnt(0)
36503 ; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
36504 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: one whole-vector select on %cond, returned unchanged.
36505 %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
36506 ret <32 x bfloat> %op
; Select between two <3 x bfloat> inreg arguments in an amdgpu_ps function,
; picking %a when %c is zero and %b otherwise, and return the 48 selected
; bits as a <2 x i32> after passing each half through
; llvm.amdgcn.readfirstlane.
;
; The check lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Shape of the expected output, as recorded by the checks:
;  * The inreg vector operands appear in s0-s5 and %c in v0; the mask comes
;    from v_cmp_eq_u32 against 0.
;  * The GCN and GFX7 prefixes expect the operands to go through v_mul_f32
;    by 1.0, v_lshrrev_b32 by 16 and v_alignbit_b32 before v_cndmask_b32.
;  * The GFX8, GFX9, GFX10 and GFX11 prefixes expect two v_cndmask_b32 on the
;    incoming registers, with the second result masked by 0xffff before its
;    v_readfirstlane_b32.
36509 define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
36510 ; GCN-LABEL: s_select_v3bf16:
36512 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
36513 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
36514 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
36515 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
36516 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
36517 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
36518 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36519 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36520 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36521 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
36522 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
36523 ; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
36524 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36525 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
36526 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36527 ; GCN-NEXT: v_readfirstlane_b32 s0, v1
36528 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
36529 ; GCN-NEXT: ; return to shader part epilog
36531 ; GFX7-LABEL: s_select_v3bf16:
36533 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
36534 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36535 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
36536 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
36537 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
36538 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36539 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
36540 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36541 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
36542 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
36543 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36544 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36545 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36546 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36547 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36548 ; GFX7-NEXT: v_readfirstlane_b32 s0, v1
36549 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
36550 ; GFX7-NEXT: ; return to shader part epilog
36552 ; GFX8-LABEL: s_select_v3bf16:
36554 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
36555 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
36556 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36557 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36558 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
36559 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
36560 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36561 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
36562 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
36563 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
36564 ; GFX8-NEXT: ; return to shader part epilog
36566 ; GFX9-LABEL: s_select_v3bf16:
36568 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
36569 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
36570 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36571 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36572 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
36573 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
36574 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36575 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
36576 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
36577 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
36578 ; GFX9-NEXT: ; return to shader part epilog
36580 ; GFX10-LABEL: s_select_v3bf16:
36582 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
36583 ; GFX10-NEXT: v_mov_b32_e32 v2, s1
36584 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36585 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36586 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36587 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
36588 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
36589 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
36590 ; GFX10-NEXT: ; return to shader part epilog
36592 ; GFX11-LABEL: s_select_v3bf16:
36594 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
36595 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36596 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36597 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
36598 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
36599 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36600 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
36601 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
36602 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
36603 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
36604 ; GFX11-NEXT: ; return to shader part epilog
; The select condition is true exactly when %c is zero.
36605 %cond = icmp eq i32 %c, 0
36606 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
; Reinterpret the three selected bfloat lanes as one 48-bit integer so they
; can be carved into i32 pieces.
36607 %cast = bitcast <3 x bfloat> %op to i48
; %elt0 keeps the low 32 bits of the i48.
36608 %elt0 = trunc i48 %cast to i32
; %elt1 keeps the remaining 16 high bits, shifted down to bit 0; the logical
; shift leaves the upper bits of the i32 zero.
36609 %elt1.hi = lshr i48 %cast, 32
36610 %elt1 = trunc i48 %elt1.hi to i32
; Each piece goes through llvm.amdgcn.readfirstlane; the checks expect the
; results in s0 and s1.
36611 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
36612 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
; Assemble the two i32 pieces into the <2 x i32> return value.
36613 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36614 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36615 ret <2 x i32> %bv.1
; Test s_select_v4bf16 -- whole-vector select between two <4 x bfloat> values
; driven by a single i1 condition (%c == 0) inside an amdgpu_ps function.
; %a and %b are marked inreg and show up in SGPRs in the expected output, while
; %c is read from v0. The selected vector is reinterpreted as <2 x i32> and
; each dword is passed through llvm.amdgcn.readfirstlane before being returned.
; The expected output on the oldest two configurations multiplies every input
; by 1.0 first (presumably bf16 elements are handled as f32 there -- confirm
; against the backend before relying on this).
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
36618 define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
36619 ; GCN-LABEL: s_select_v4bf16:
36621 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
36622 ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
36623 ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5
36624 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
36625 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
36626 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
36627 ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
36628 ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
36629 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36630 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36631 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
36632 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
36633 ; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
36634 ; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
36635 ; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
36636 ; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
36637 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36638 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36639 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36640 ; GCN-NEXT: v_readfirstlane_b32 s0, v1
36641 ; GCN-NEXT: v_readfirstlane_b32 s1, v0
36642 ; GCN-NEXT: ; return to shader part epilog
36644 ; GFX7-LABEL: s_select_v4bf16:
36646 ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
36647 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
36648 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
36649 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
36650 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
36651 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36652 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
36653 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
36654 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
36655 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
36656 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
36657 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
36658 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
36659 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
36660 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
36661 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
36662 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36663 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
36664 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
36665 ; GFX7-NEXT: v_readfirstlane_b32 s0, v1
36666 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0
36667 ; GFX7-NEXT: ; return to shader part epilog
36669 ; GFX8-LABEL: s_select_v4bf16:
36671 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
36672 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
36673 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36674 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36675 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
36676 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
36677 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36678 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1
36679 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0
36680 ; GFX8-NEXT: ; return to shader part epilog
36682 ; GFX9-LABEL: s_select_v4bf16:
36684 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
36685 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
36686 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36687 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
36688 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
36689 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
36690 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
36691 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
36692 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
36693 ; GFX9-NEXT: ; return to shader part epilog
36695 ; GFX10-LABEL: s_select_v4bf16:
36697 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
36698 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
36699 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36700 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36701 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36702 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0
36703 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
36704 ; GFX10-NEXT: ; return to shader part epilog
36706 ; GFX11-LABEL: s_select_v4bf16:
36708 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
36709 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36710 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
36711 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
36712 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
36713 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36714 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
36715 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
36716 ; GFX11-NEXT: ; return to shader part epilog
36717 %cond = icmp eq i32 %c, 0 ; single i1 condition: true when %c is zero
36718 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b ; whole vector is %a when %cond holds, otherwise %b
36719 %cast = bitcast <4 x bfloat> %op to <2 x i32> ; view the 64-bit result as two dwords
36720 %elt0 = extractelement <2 x i32> %cast, i32 0
36721 %elt1 = extractelement <2 x i32> %cast, i32 1
36722 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) ; shows up as v_readfirstlane_b32 into s0 in the checks
36723 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) ; shows up as v_readfirstlane_b32 into s1 in the checks
36724 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36725 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36726 ret <2 x i32> %bv.1
; Test s_vselect_v4bf16 -- per-element select between two <4 x bfloat> values
; in an amdgpu_ps function. Unlike the scalar-condition variant, the condition
; here is a <4 x i1> obtained by comparing each element of the <4 x i32> %c
; against zero, so every bfloat lane is chosen independently. %a and %b are
; marked inreg and show up in SGPRs in the expected output; %c is read from
; v0-v3. The result is reinterpreted as <2 x i32> and each dword is passed
; through llvm.amdgcn.readfirstlane before being returned.
; For this function the gfx1100 output differs between the +real-true16 and
; -real-true16 runs, so it is checked under the separate GFX11TRUE16 and
; GFX11FAKE16 prefixes rather than the shared GFX11 one.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
36729 define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
36730 ; GCN-LABEL: s_vselect_v4bf16:
36732 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
36733 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4
36734 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1
36735 ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5
36736 ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2
36737 ; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6
36738 ; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3
36739 ; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7
36740 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36741 ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
36742 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36743 ; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
36744 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36745 ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
36746 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36747 ; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
36748 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36749 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36750 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36751 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
36752 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3
36753 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
36754 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
36755 ; GCN-NEXT: v_readfirstlane_b32 s1, v2
36756 ; GCN-NEXT: ; return to shader part epilog
36758 ; GFX7-LABEL: s_vselect_v4bf16:
36760 ; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3
36761 ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7
36762 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36763 ; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2
36764 ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6
36765 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
36766 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36767 ; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1
36768 ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5
36769 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
36770 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36771 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
36772 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4
36773 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
36774 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36775 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
36776 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36777 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
36778 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36779 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
36780 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
36781 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
36782 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
36783 ; GFX7-NEXT: v_readfirstlane_b32 s1, v2
36784 ; GFX7-NEXT: ; return to shader part epilog
36786 ; GFX8-LABEL: s_vselect_v4bf16:
36788 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16
36789 ; GFX8-NEXT: s_lshr_b32 s5, s3, 16
36790 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
36791 ; GFX8-NEXT: v_mov_b32_e32 v5, s4
36792 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36793 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
36794 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
36795 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
36796 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36797 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
36798 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
36799 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
36800 ; GFX8-NEXT: s_lshr_b32 s3, s2, 16
36801 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36802 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
36803 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
36804 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36805 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
36806 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
36807 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
36808 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36809 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
36810 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
36811 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
36812 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
36813 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2
36814 ; GFX8-NEXT: ; return to shader part epilog
36816 ; GFX9-LABEL: s_vselect_v4bf16:
36818 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16
36819 ; GFX9-NEXT: s_lshr_b32 s5, s3, 16
36820 ; GFX9-NEXT: v_mov_b32_e32 v4, s5
36821 ; GFX9-NEXT: v_mov_b32_e32 v5, s4
36822 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
36823 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
36824 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
36825 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
36826 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
36827 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
36828 ; GFX9-NEXT: s_mov_b32 s1, 0x5040100
36829 ; GFX9-NEXT: s_lshr_b32 s3, s0, 16
36830 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
36831 ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1
36832 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
36833 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
36834 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
36835 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
36836 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
36837 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
36838 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
36839 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
36840 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1
36841 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
36842 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
36843 ; GFX9-NEXT: ; return to shader part epilog
36845 ; GFX10-LABEL: s_vselect_v4bf16:
36847 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
36848 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
36849 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
36850 ; GFX10-NEXT: s_lshr_b32 s4, s3, 16
36851 ; GFX10-NEXT: s_lshr_b32 s5, s0, 16
36852 ; GFX10-NEXT: v_mov_b32_e32 v6, s0
36853 ; GFX10-NEXT: s_lshr_b32 s0, s2, 16
36854 ; GFX10-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36855 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
36856 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
36857 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
36858 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36859 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36860 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36861 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
36862 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
36863 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36864 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
36865 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
36866 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
36867 ; GFX10-NEXT: ; return to shader part epilog
36869 ; GFX11TRUE16-LABEL: s_vselect_v4bf16:
36870 ; GFX11TRUE16: ; %bb.0:
36871 ; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16
36872 ; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16
36873 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4
36874 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
36875 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
36876 ; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
36877 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s2
36878 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s0
36879 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s4
36880 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
36881 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
36882 ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
36883 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
36884 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
36885 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s1
36886 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
36887 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
36888 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36889 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
36890 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
36891 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc_lo
36892 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
36893 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36894 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
36895 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
36896 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
36897 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1
36898 ; GFX11TRUE16-NEXT: ; return to shader part epilog
36900 ; GFX11FAKE16-LABEL: s_vselect_v4bf16:
36901 ; GFX11FAKE16: ; %bb.0:
36902 ; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16
36903 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
36904 ; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
36905 ; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16
36906 ; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16
36907 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
36908 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
36909 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5
36910 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
36911 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0
36912 ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16
36913 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
36914 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
36915 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
36916 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
36917 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
36918 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
36919 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
36920 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
36921 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36922 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
36923 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
36924 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
36925 ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1
36926 ; GFX11FAKE16-NEXT: ; return to shader part epilog
36927 %cond = icmp eq <4 x i32> %c, zeroinitializer ; per-element condition: lane i is true when %c[i] is zero
36928 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ; each lane takes %a[i] when its condition holds, otherwise %b[i]
36929 %cast = bitcast <4 x bfloat> %op to <2 x i32> ; view the 64-bit result as two dwords
36930 %elt0 = extractelement <2 x i32> %cast, i32 0
36931 %elt1 = extractelement <2 x i32> %cast, i32 1
36932 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) ; shows up as v_readfirstlane_b32 into s0 in the checks
36933 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) ; shows up as v_readfirstlane_b32 into s1 in the checks
36934 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
36935 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
36936 ret <2 x i32> %bv.1
; Test v_vselect_v4bf16 -- per-element select between two <4 x bfloat> values
; in a function with the default calling convention. The <4 x i1> condition is
; passed in directly as an argument; in the expected output each condition
; element arrives in its own VGPR (v0-v3), is masked with 1 and compared
; against 1 before driving a v_cndmask. The result is returned as <4 x bfloat>
; with no bitcast or readfirstlane step.
; For this function the gfx1100 output differs between the +real-true16 and
; -real-true16 runs, so it is checked under the separate GFX11TRUE16 and
; GFX11FAKE16 prefixes rather than the shared GFX11 one.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
36939 define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
36940 ; GCN-LABEL: v_vselect_v4bf16:
36942 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36943 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
36944 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
36945 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
36946 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
36947 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
36948 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
36949 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
36950 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
36951 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
36952 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
36953 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
36954 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
36955 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
36956 ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
36957 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
36958 ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
36959 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
36960 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
36961 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36962 ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
36963 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
36964 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36965 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
36966 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36967 ; GCN-NEXT: s_setpc_b64 s[30:31]
36969 ; GFX7-LABEL: v_vselect_v4bf16:
36971 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36972 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
36973 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
36974 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
36975 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
36976 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
36977 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
36978 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
36979 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
36980 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
36981 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
36982 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
36983 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
36984 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
36985 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
36986 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
36987 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
36988 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
36989 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
36990 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
36991 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
36992 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
36993 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
36994 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
36995 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
36996 ; GFX7-NEXT: s_setpc_b64 s[30:31]
36998 ; GFX8-LABEL: v_vselect_v4bf16:
37000 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37001 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
37002 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
37003 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5
37004 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
37005 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37006 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
37007 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
37008 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37009 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37010 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
37011 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
37012 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
37013 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37014 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
37015 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37016 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
37017 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37018 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37019 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
37020 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37021 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37023 ; GFX9-LABEL: v_vselect_v4bf16:
37025 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37026 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37027 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37028 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37029 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37030 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
37031 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37032 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37033 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37034 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37035 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
37036 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37037 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
37038 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
37039 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6
37040 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37041 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
37042 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37043 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37044 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37045 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37047 ; GFX10-LABEL: v_vselect_v4bf16:
37049 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37050 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
37051 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
37052 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
37053 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
37054 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
37055 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37056 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
37057 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
37058 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37059 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37060 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37061 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
37062 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37063 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37064 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37065 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37066 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37067 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37068 ; GFX10-NEXT: s_setpc_b64 s[30:31]
37070 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
37071 ; GFX11TRUE16: ; %bb.0:
37072 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37073 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
37074 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
37075 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
37076 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37077 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37078 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37079 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37080 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
37081 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo
37082 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
37083 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37084 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
37085 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
37086 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4
37087 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
37088 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37089 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37090 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
37091 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37092 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37093 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37094 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37095 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37096 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37098 ; GFX11FAKE16-LABEL: v_vselect_v4bf16:
37099 ; GFX11FAKE16: ; %bb.0:
37100 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37101 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
37102 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
37103 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
37104 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37105 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37106 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
37107 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
37108 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
37109 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
37110 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37111 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
37112 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37113 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37114 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
37115 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37116 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37117 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
37118 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37119 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37120 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
37121 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ; each lane takes %a[i] when %cond[i] is set, otherwise %b[i]
37122 ret <4 x bfloat> %op
37125 define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
37126 ; GCN-LABEL: v_vselect_v8bf16:
37128 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37129 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37130 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
37131 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
37132 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
37133 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
37134 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
37135 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
37136 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
37137 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
37138 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
37139 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
37140 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
37141 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
37142 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
37143 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
37144 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
37145 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
37146 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5
37147 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
37148 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
37149 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6
37150 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
37151 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
37152 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7
37153 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37154 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
37155 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37156 ; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
37157 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37158 ; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
37159 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37160 ; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
37161 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37162 ; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
37163 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37164 ; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
37165 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37166 ; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
37167 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37168 ; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
37169 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37170 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37171 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37172 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37173 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
37174 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37175 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37176 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37177 ; GCN-NEXT: s_setpc_b64 s[30:31]
37179 ; GFX7-LABEL: v_vselect_v8bf16:
37181 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37182 ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
37183 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
37184 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
37185 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
37186 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37187 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
37188 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
37189 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
37190 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
37191 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37192 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
37193 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
37194 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
37195 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
37196 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37197 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
37198 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
37199 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
37200 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
37201 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37202 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
37203 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
37204 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
37205 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
37206 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37207 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
37208 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
37209 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
37210 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
37211 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37212 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
37213 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
37214 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
37215 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
37216 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37217 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
37218 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
37219 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
37220 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37221 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
37222 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37223 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37224 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
37225 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
37226 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
37227 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37228 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37229 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37230 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37232 ; GFX8-LABEL: v_vselect_v8bf16:
37234 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37235 ; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
37236 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
37237 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11
37238 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
37239 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37240 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
37241 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc
37242 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37243 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
37244 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
37245 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
37246 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
37247 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37248 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
37249 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc
37250 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37251 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
37252 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
37253 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
37254 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13
37255 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37256 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
37257 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
37258 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37259 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37260 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
37261 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8
37262 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12
37263 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37264 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
37265 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37266 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
37267 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37268 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37269 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
37270 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37271 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
37272 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
37273 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37274 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37275 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37277 ; GFX9-LABEL: v_vselect_v8bf16:
37279 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37280 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
37281 ; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
37282 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
37283 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
37284 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
37285 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37286 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37287 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
37288 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
37289 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
37290 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
37291 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37292 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
37293 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
37294 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14
37295 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
37296 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37297 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
37298 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37299 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37300 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
37301 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37302 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13
37303 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37304 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37305 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
37306 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37307 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
37308 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
37309 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12
37310 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37311 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
37312 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37313 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37314 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37315 ; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
37316 ; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4
37317 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37319 ; GFX10-LABEL: v_vselect_v8bf16:
37321 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37322 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
37323 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
37324 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
37325 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
37326 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10
37327 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37328 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14
37329 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
37330 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
37331 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
37332 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
37333 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37334 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
37335 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37336 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37337 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
37338 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37339 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
37340 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12
37341 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37342 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37343 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
37344 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37345 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37346 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
37347 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37348 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37349 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37350 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37351 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37352 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37353 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37354 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37355 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37356 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
37357 ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
37358 ; GFX10-NEXT: s_setpc_b64 s[30:31]
37360 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:
37361 ; GFX11TRUE16: ; %bb.0:
37362 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37363 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
37364 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v11.l
37365 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37366 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37367 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
37368 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
37369 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
37370 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37371 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
37372 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo
37373 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37374 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
37375 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v14
37376 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10
37377 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
37378 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v13.l
37379 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
37380 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
37381 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
37382 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37383 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
37384 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
37385 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37386 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37387 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc_lo
37388 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37389 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc_lo
37390 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
37391 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37392 ; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
37393 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v12
37394 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v8
37395 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37396 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37397 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
37398 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc_lo
37399 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37400 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37401 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37402 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37403 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37404 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37405 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37406 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
37407 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
37408 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37410 ; GFX11FAKE16-LABEL: v_vselect_v8bf16:
37411 ; GFX11FAKE16: ; %bb.0:
37412 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37413 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10
37414 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14
37415 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
37416 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
37417 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
37418 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
37419 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37420 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
37421 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
37422 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
37423 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
37424 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37425 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
37426 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37427 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
37428 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12
37429 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
37430 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
37431 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37432 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
37433 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
37434 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37435 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
37436 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
37437 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
37438 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37439 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
37440 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37441 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
37442 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37443 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
37444 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37445 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
37446 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
37447 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
37448 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
37449 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
37450 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
37451 %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
37452 ret <8 x bfloat> %op
37455 define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
37456 ; GCN-LABEL: v_vselect_v16bf16:
37458 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37459 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
37460 ; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
37461 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
37462 ; GCN-NEXT: s_waitcnt expcnt(0)
37463 ; GCN-NEXT: v_writelane_b32 v31, s30, 0
37464 ; GCN-NEXT: v_writelane_b32 v31, s31, 1
37465 ; GCN-NEXT: v_writelane_b32 v31, s34, 2
37466 ; GCN-NEXT: v_writelane_b32 v31, s35, 3
37467 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
37468 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37469 ; GCN-NEXT: v_and_b32_e32 v0, 1, v1
37470 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
37471 ; GCN-NEXT: v_and_b32_e32 v0, 1, v2
37472 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
37473 ; GCN-NEXT: v_and_b32_e32 v0, 1, v3
37474 ; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
37475 ; GCN-NEXT: v_and_b32_e32 v0, 1, v4
37476 ; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
37477 ; GCN-NEXT: v_and_b32_e32 v0, 1, v5
37478 ; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
37479 ; GCN-NEXT: v_and_b32_e32 v0, 1, v6
37480 ; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
37481 ; GCN-NEXT: v_and_b32_e32 v0, 1, v7
37482 ; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
37483 ; GCN-NEXT: v_and_b32_e32 v0, 1, v8
37484 ; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
37485 ; GCN-NEXT: v_and_b32_e32 v0, 1, v9
37486 ; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
37487 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
37488 ; GCN-NEXT: v_and_b32_e32 v1, 1, v10
37489 ; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
37490 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
37491 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
37492 ; GCN-NEXT: v_and_b32_e32 v3, 1, v11
37493 ; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3
37494 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
37495 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
37496 ; GCN-NEXT: v_and_b32_e32 v5, 1, v12
37497 ; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5
37498 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
37499 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
37500 ; GCN-NEXT: v_and_b32_e32 v7, 1, v13
37501 ; GCN-NEXT: v_and_b32_e32 v8, 1, v14
37502 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
37503 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32
37504 ; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8
37505 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
37506 ; GCN-NEXT: v_and_b32_e32 v9, 1, v15
37507 ; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9
37508 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60
37509 ; GCN-NEXT: s_waitcnt vmcnt(2)
37510 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37511 ; GCN-NEXT: s_waitcnt vmcnt(1)
37512 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37513 ; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35]
37514 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56
37515 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
37516 ; GCN-NEXT: s_waitcnt vmcnt(1)
37517 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
37518 ; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31]
37519 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
37520 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29
37521 ; GCN-NEXT: s_waitcnt vmcnt(1)
37522 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37523 ; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29]
37524 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
37525 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28
37526 ; GCN-NEXT: s_waitcnt vmcnt(1)
37527 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37528 ; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27]
37529 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
37530 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27
37531 ; GCN-NEXT: s_waitcnt vmcnt(1)
37532 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37533 ; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25]
37534 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
37535 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
37536 ; GCN-NEXT: s_waitcnt vmcnt(1)
37537 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37538 ; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23]
37539 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
37540 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25
37541 ; GCN-NEXT: s_waitcnt vmcnt(1)
37542 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37543 ; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21]
37544 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
37545 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24
37546 ; GCN-NEXT: s_waitcnt vmcnt(1)
37547 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
37548 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19]
37549 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28
37550 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
37551 ; GCN-NEXT: s_waitcnt vmcnt(1)
37552 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
37553 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17]
37554 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
37555 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
37556 ; GCN-NEXT: s_waitcnt vmcnt(1)
37557 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
37558 ; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15]
37559 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
37560 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
37561 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
37562 ; GCN-NEXT: s_waitcnt vmcnt(1)
37563 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
37564 ; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
37565 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
37566 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
37567 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
37568 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
37569 ; GCN-NEXT: s_waitcnt vmcnt(1)
37570 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
37571 ; GCN-NEXT: s_waitcnt vmcnt(0)
37572 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
37573 ; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
37574 ; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9]
37575 ; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
37576 ; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
37577 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
37578 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37579 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
37580 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
37581 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
37582 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
37583 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
37584 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
37585 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37586 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
37587 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
37588 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
37589 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
37590 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
37591 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
37592 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
37593 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
37594 ; GCN-NEXT: v_readlane_b32 s35, v31, 3
37595 ; GCN-NEXT: v_readlane_b32 s34, v31, 2
37596 ; GCN-NEXT: v_readlane_b32 s31, v31, 1
37597 ; GCN-NEXT: v_readlane_b32 s30, v31, 0
37598 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
37599 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
37600 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
37601 ; GCN-NEXT: s_waitcnt vmcnt(0)
37602 ; GCN-NEXT: s_setpc_b64 s[30:31]
37604 ; GFX7-LABEL: v_vselect_v16bf16:
37606 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37607 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
37608 ; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
37609 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
37610 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
37611 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37612 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
37613 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
37614 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
37615 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
37616 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
37617 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
37618 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
37619 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
37620 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
37621 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
37622 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
37623 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
37624 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
37625 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
37626 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
37627 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
37628 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
37629 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
37630 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
37631 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
37632 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
37633 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
37634 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
37635 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
37636 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
37637 ; GFX7-NEXT: v_writelane_b32 v31, s30, 0
37638 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
37639 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
37640 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
37641 ; GFX7-NEXT: v_writelane_b32 v31, s31, 1
37642 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
37643 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
37644 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
37645 ; GFX7-NEXT: v_writelane_b32 v31, s34, 2
37646 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
37647 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52
37648 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v15
37649 ; GFX7-NEXT: v_writelane_b32 v31, s35, 3
37650 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
37651 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
37652 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
37653 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
37654 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
37655 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
37656 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
37657 ; GFX7-NEXT: s_waitcnt vmcnt(5)
37658 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
37659 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37660 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
37661 ; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35]
37662 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
37663 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
37664 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37665 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
37666 ; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31]
37667 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
37668 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29
37669 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37670 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
37671 ; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29]
37672 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
37673 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28
37674 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37675 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
37676 ; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27]
37677 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32
37678 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27
37679 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37680 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
37681 ; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25]
37682 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28
37683 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
37684 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
37685 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
37686 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
37687 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
37688 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
37689 ; GFX7-NEXT: v_readlane_b32 s35, v31, 3
37690 ; GFX7-NEXT: v_readlane_b32 s34, v31, 2
37691 ; GFX7-NEXT: v_readlane_b32 s31, v31, 1
37692 ; GFX7-NEXT: v_readlane_b32 s30, v31, 0
37693 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37694 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
37695 ; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
37696 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
37697 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
37698 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37699 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
37700 ; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21]
37701 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
37702 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
37703 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37704 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
37705 ; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19]
37706 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
37707 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
37708 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37709 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
37710 ; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17]
37711 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
37712 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
37713 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37714 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
37715 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15]
37716 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
37717 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
37718 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
37719 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
37720 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
37721 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
37722 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
37723 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37724 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
37725 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
37726 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
37727 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
37728 ; GFX7-NEXT: s_waitcnt vmcnt(4)
37729 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
37730 ; GFX7-NEXT: s_waitcnt vmcnt(3)
37731 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
37732 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5]
37733 ; GFX7-NEXT: s_waitcnt vmcnt(2)
37734 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
37735 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7]
37736 ; GFX7-NEXT: s_waitcnt vmcnt(1)
37737 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
37738 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9]
37739 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37740 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
37741 ; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
37742 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc
37743 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
37744 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
37745 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
37746 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
37747 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
37748 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
37749 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
37750 ; GFX7-NEXT: s_mov_b64 exec, s[4:5]
37751 ; GFX7-NEXT: s_waitcnt vmcnt(0)
37752 ; GFX7-NEXT: s_setpc_b64 s[30:31]
37754 ; GFX8-LABEL: v_vselect_v16bf16:
37756 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37757 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
37758 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
37759 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
37760 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
37761 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37762 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
37763 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
37764 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
37765 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
37766 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
37767 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
37768 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
37769 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
37770 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
37771 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
37772 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
37773 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
37774 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
37775 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
37776 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
37777 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
37778 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
37779 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
37780 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
37781 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
37782 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
37783 ; GFX8-NEXT: v_writelane_b32 v31, s30, 0
37784 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v12
37785 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v13
37786 ; GFX8-NEXT: v_writelane_b32 v31, s31, 1
37787 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
37788 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22
37789 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
37790 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30
37791 ; GFX8-NEXT: v_writelane_b32 v31, s34, 2
37792 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v11
37793 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v14
37794 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v15
37795 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29]
37796 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20
37797 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28
37798 ; GFX8-NEXT: v_writelane_b32 v31, s35, 3
37799 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
37800 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
37801 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
37802 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
37803 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21]
37804 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21
37805 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29
37806 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25]
37807 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19
37808 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27
37809 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17]
37810 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
37811 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
37812 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15]
37813 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
37814 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
37815 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
37816 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19]
37817 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11]
37818 ; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
37819 ; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37820 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
37821 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
37822 ; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37823 ; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37824 ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37825 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37826 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
37827 ; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31]
37828 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35]
37829 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
37830 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
37831 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
37832 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17
37833 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25
37834 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9]
37835 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
37836 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5]
37837 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc
37838 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
37839 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
37840 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
37841 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13
37842 ; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37843 ; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37844 ; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37845 ; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
37846 ; GFX8-NEXT: v_readlane_b32 s35, v31, 3
37847 ; GFX8-NEXT: v_readlane_b32 s34, v31, 2
37848 ; GFX8-NEXT: v_readlane_b32 s31, v31, 1
37849 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0
37850 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
37851 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
37852 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
37853 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37854 ; GFX8-NEXT: s_setpc_b64 s[30:31]
37856 ; GFX9-LABEL: v_vselect_v16bf16:
37858 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37859 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
37860 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4
37861 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v14
37862 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4
37863 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v15
37864 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4
37865 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32
37866 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
37867 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
37868 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v13
37869 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
37870 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
37871 ; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
37872 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
37873 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10
37874 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
37875 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
37876 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6
37877 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v7
37878 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5
37879 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22
37880 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30
37881 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10
37882 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8
37883 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v9
37884 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
37885 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21
37886 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29
37887 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8
37888 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9]
37889 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20
37890 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28
37891 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6
37892 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13]
37893 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
37894 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27
37895 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
37896 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17]
37897 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23
37898 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc
37899 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
37900 ; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19]
37901 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
37902 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
37903 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
37904 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
37905 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
37906 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
37907 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7]
37908 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11]
37909 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
37910 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
37911 ; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4
37912 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4
37913 ; GFX9-NEXT: s_waitcnt vmcnt(0)
37914 ; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21]
37915 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
37916 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23]
37917 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18
37918 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26
37919 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25]
37920 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25
37921 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
37922 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
37923 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
37924 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
37925 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24
37926 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
37927 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
37928 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
37929 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
37930 ; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4
37931 ; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4
37932 ; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4
37933 ; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4
37934 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37936 ; GFX10-LABEL: v_vselect_v16bf16:
37938 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37939 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
37940 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
37941 ; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
37942 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
37943 ; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22
37944 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30
37945 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
37946 ; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
37947 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
37948 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
37949 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
37950 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
37951 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
37952 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
37953 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
37954 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
37955 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
37956 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
37957 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
37958 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
37959 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
37960 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
37961 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
37962 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
37963 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
37964 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
37965 ; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25
37966 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
37967 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
37968 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
37969 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
37970 ; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16
37971 ; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24
37972 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
37973 ; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
37974 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
37975 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
37976 ; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26
37977 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
37978 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19
37979 ; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27
37980 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
37981 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
37982 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
37983 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23
37984 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
37985 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
37986 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
37987 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
37988 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
37989 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37990 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
37991 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37992 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
37993 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
37994 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
37995 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
37996 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
37997 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
37998 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
37999 ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
38000 ; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
38001 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38002 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
38003 ; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
38004 ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
38005 ; GFX10-NEXT: s_waitcnt vmcnt(0)
38006 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
38007 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
38008 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
38009 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
38010 ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
38011 ; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
38012 ; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
38013 ; GFX10-NEXT: s_setpc_b64 s[30:31]
38015 ; GFX11TRUE16-LABEL: v_vselect_v16bf16:
38016 ; GFX11TRUE16: ; %bb.0:
38017 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38018 ; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
38019 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l
38020 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
38021 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
38022 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
38023 ; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
38024 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
38025 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.l
38026 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v53.l, v24.l
38027 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v54.l, v16.l
38028 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
38029 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
38030 ; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
38031 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
38032 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
38033 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
38034 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v36.l, v21.l
38035 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v51.l, v25.l
38036 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc_lo
38037 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l
38038 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
38039 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v25
38040 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
38041 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
38042 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
38043 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v37.l, v28.l
38044 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v38.l, v20.l
38045 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
38046 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v36, vcc_lo
38047 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v28
38048 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
38049 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
38050 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
38051 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
38052 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l
38053 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
38054 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v37, v38, vcc_lo
38055 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
38056 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
38057 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
38058 ; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
38059 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
38060 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29
38061 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
38062 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v39, v48, vcc_lo
38063 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
38064 ; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
38065 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v22
38066 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
38067 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
38068 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v49, v50, vcc_lo
38069 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
38070 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
38071 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.l
38072 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v23
38073 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v51, v52, vcc_lo
38074 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38075 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
38076 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v53, v54, vcc_lo
38077 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
38078 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
38079 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc_lo
38080 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
38081 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc_lo
38082 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
38083 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v28, v20, vcc_lo
38084 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
38085 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v27, v19, vcc_lo
38086 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38087 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v24, v16, vcc_lo
38088 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
38089 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
38090 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
38091 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc_lo
38092 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
38093 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
38094 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v26, v18, vcc_lo
38095 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38096 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
38097 ; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
38098 ; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
38099 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
38100 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v31
38101 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.l
38102 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
38103 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l
38104 ; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
38105 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
38106 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
38107 ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15
38108 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
38109 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v3, v23, vcc_lo
38110 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
38111 ; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
38112 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
38113 ; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
38114 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
38116 ; GFX11FAKE16-LABEL: v_vselect_v16bf16:
38117 ; GFX11FAKE16: ; %bb.0:
38118 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38119 ; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32
38120 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
38121 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
38122 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
38123 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
38124 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
38125 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
38126 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
38127 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
38128 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
38129 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
38130 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
38131 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
38132 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
38133 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
38134 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
38135 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
38136 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
38137 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
38138 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
38139 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
38140 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
38141 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
38142 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
38143 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
38144 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
38145 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
38146 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
38147 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
38148 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
38149 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
38150 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
38151 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
38152 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
38153 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
38154 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
38155 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
38156 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
38157 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
38158 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
38159 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
38160 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
38161 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
38162 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
38163 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
38164 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
38165 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
38166 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
38167 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
38168 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38169 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
38170 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38171 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
38172 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
38173 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
38174 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
38175 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
38176 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
38177 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
38178 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
38179 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
38180 ; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
38181 ; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
38182 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
38183 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31
38184 ; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
38185 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38186 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
38187 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
38188 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
38189 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
38190 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
38191 ; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
38192 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
38193 ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
38194 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
38195 %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
38196 ret <16 x bfloat> %op
38199 define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
38200 ; GCN-LABEL: v_vselect_v32bf16:
38202 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38203 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
38204 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
38205 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
38206 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
38207 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
38208 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
38209 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
38210 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
38211 ; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
38212 ; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
38213 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
38214 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
38215 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
38216 ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
38217 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
38218 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
38219 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2
38220 ; GCN-NEXT: v_and_b32_e32 v36, 1, v13
38221 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
38222 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
38223 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
38224 ; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184
38225 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
38226 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
38227 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
38228 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192
38229 ; GCN-NEXT: v_and_b32_e32 v53, 1, v26
38230 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
38231 ; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
38232 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
38233 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
38234 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
38235 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
38236 ; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
38237 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112
38238 ; GCN-NEXT: v_and_b32_e32 v27, 1, v27
38239 ; GCN-NEXT: v_and_b32_e32 v28, 1, v28
38240 ; GCN-NEXT: v_and_b32_e32 v29, 1, v29
38241 ; GCN-NEXT: v_and_b32_e32 v30, 1, v30
38242 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116
38243 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120
38244 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124
38245 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
38246 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252
38247 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248
38248 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244
38249 ; GCN-NEXT: s_waitcnt expcnt(6)
38250 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240
38251 ; GCN-NEXT: s_waitcnt vmcnt(14)
38252 ; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37
38253 ; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
38254 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
38255 ; GCN-NEXT: s_waitcnt vmcnt(5)
38256 ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43
38257 ; GCN-NEXT: s_waitcnt vmcnt(3)
38258 ; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44
38259 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
38260 ; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5]
38261 ; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
38262 ; GCN-NEXT: s_waitcnt expcnt(5)
38263 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232
38264 ; GCN-NEXT: s_waitcnt expcnt(4)
38265 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228
38266 ; GCN-NEXT: s_waitcnt expcnt(3)
38267 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224
38268 ; GCN-NEXT: s_waitcnt expcnt(2)
38269 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220
38270 ; GCN-NEXT: s_waitcnt expcnt(1)
38271 ; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216
38272 ; GCN-NEXT: s_waitcnt expcnt(0)
38273 ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212
38274 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128
38275 ; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
38276 ; GCN-NEXT: s_waitcnt vmcnt(10)
38277 ; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
38278 ; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
38279 ; GCN-NEXT: s_waitcnt vmcnt(9)
38280 ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
38281 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
38282 ; GCN-NEXT: s_waitcnt vmcnt(8)
38283 ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
38284 ; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
38285 ; GCN-NEXT: s_waitcnt vmcnt(7)
38286 ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
38287 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
38288 ; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5]
38289 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
38290 ; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5]
38291 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
38292 ; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5]
38293 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53
38294 ; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5]
38295 ; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4
38296 ; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
38297 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8
38298 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136
38299 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
38300 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140
38301 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
38302 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
38303 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3
38304 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4
38305 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5
38306 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6
38307 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18
38308 ; GCN-NEXT: v_and_b32_e32 v22, 1, v22
38309 ; GCN-NEXT: v_and_b32_e32 v23, 1, v23
38310 ; GCN-NEXT: v_and_b32_e32 v24, 1, v24
38311 ; GCN-NEXT: v_and_b32_e32 v25, 1, v25
38312 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38313 ; GCN-NEXT: s_waitcnt vmcnt(14)
38314 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
38315 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38316 ; GCN-NEXT: s_waitcnt vmcnt(13)
38317 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
38318 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38319 ; GCN-NEXT: s_waitcnt vmcnt(12)
38320 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
38321 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38322 ; GCN-NEXT: s_waitcnt vmcnt(11)
38323 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
38324 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
38325 ; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5]
38326 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
38327 ; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5]
38328 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
38329 ; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5]
38330 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
38331 ; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5]
38332 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
38333 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196
38334 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
38335 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200
38336 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76
38337 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204
38338 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80
38339 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
38340 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19
38341 ; GCN-NEXT: v_and_b32_e32 v20, 1, v20
38342 ; GCN-NEXT: v_and_b32_e32 v21, 1, v21
38343 ; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
38344 ; GCN-NEXT: s_waitcnt vmcnt(14)
38345 ; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
38346 ; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
38347 ; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61
38348 ; GCN-NEXT: s_waitcnt vmcnt(3)
38349 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
38350 ; GCN-NEXT: s_waitcnt vmcnt(2)
38351 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
38352 ; GCN-NEXT: s_waitcnt vmcnt(1)
38353 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
38354 ; GCN-NEXT: s_waitcnt vmcnt(0)
38355 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
38356 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
38357 ; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5]
38358 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
38359 ; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5]
38360 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
38361 ; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5]
38362 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
38363 ; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5]
38364 ; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
38365 ; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148
38366 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
38367 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152
38368 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
38369 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156
38370 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
38371 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
38372 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7
38373 ; GCN-NEXT: v_and_b32_e32 v8, 1, v8
38374 ; GCN-NEXT: v_and_b32_e32 v9, 1, v9
38375 ; GCN-NEXT: v_and_b32_e32 v10, 1, v10
38376 ; GCN-NEXT: v_and_b32_e32 v14, 1, v14
38377 ; GCN-NEXT: v_and_b32_e32 v15, 1, v15
38378 ; GCN-NEXT: v_and_b32_e32 v16, 1, v16
38379 ; GCN-NEXT: v_and_b32_e32 v17, 1, v17
38380 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
38381 ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
38382 ; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
38383 ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
38384 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38385 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38386 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38387 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38388 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
38389 ; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5]
38390 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
38391 ; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5]
38392 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
38393 ; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5]
38394 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
38395 ; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5]
38396 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
38397 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
38398 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40
38399 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168
38400 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44
38401 ; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172
38402 ; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
38403 ; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176
38404 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11
38405 ; GCN-NEXT: v_and_b32_e32 v12, 1, v12
38406 ; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc
38407 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256
38408 ; GCN-NEXT: v_and_b32_e32 v26, 1, v26
38409 ; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
38410 ; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
38411 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
38412 ; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
38413 ; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
38414 ; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
38415 ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
38416 ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
38417 ; GCN-NEXT: s_waitcnt vmcnt(14)
38418 ; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
38419 ; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
38420 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
38421 ; GCN-NEXT: s_waitcnt vmcnt(13)
38422 ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
38423 ; GCN-NEXT: s_waitcnt vmcnt(12)
38424 ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
38425 ; GCN-NEXT: s_waitcnt vmcnt(11)
38426 ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
38427 ; GCN-NEXT: s_waitcnt vmcnt(10)
38428 ; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
38429 ; GCN-NEXT: s_waitcnt vmcnt(9)
38430 ; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
38431 ; GCN-NEXT: s_waitcnt vmcnt(8)
38432 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
38433 ; GCN-NEXT: s_waitcnt vmcnt(7)
38434 ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
38435 ; GCN-NEXT: s_waitcnt vmcnt(6)
38436 ; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
38437 ; GCN-NEXT: s_waitcnt vmcnt(5)
38438 ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
38439 ; GCN-NEXT: s_waitcnt vmcnt(4)
38440 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
38441 ; GCN-NEXT: s_waitcnt vmcnt(3)
38442 ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
38443 ; GCN-NEXT: s_waitcnt vmcnt(2)
38444 ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
38445 ; GCN-NEXT: s_waitcnt vmcnt(1)
38446 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
38447 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
38448 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
38449 ; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
38450 ; GCN-NEXT: s_waitcnt vmcnt(0)
38451 ; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
38452 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
38453 ; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc
38454 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
38455 ; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc
38456 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
38457 ; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc
38458 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
38459 ; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc
38460 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
38461 ; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
38462 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
38463 ; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc
38464 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
38465 ; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc
38466 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
38467 ; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc
38468 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
38469 ; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc
38470 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
38471 ; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc
38472 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
38473 ; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc
38474 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38475 ; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc
38476 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38477 ; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc
38478 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
38479 ; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc
38480 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
38481 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
38482 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
38483 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
38484 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
38485 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
38486 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
38487 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
38488 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
38489 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
38490 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
38491 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
38492 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
38493 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
38494 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
38495 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
38496 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
38497 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
38498 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
38499 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
38500 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
38501 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
38502 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
38503 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
38504 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
38505 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
38506 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
38507 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
38508 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
38509 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
38510 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
38511 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
38512 ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
38513 ; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
38514 ; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
38515 ; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
38516 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
38517 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
38518 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
38519 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
38520 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
38521 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
38522 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
38523 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
38524 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
38525 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
38526 ; GCN-NEXT: s_waitcnt vmcnt(0)
38527 ; GCN-NEXT: s_setpc_b64 s[30:31]
38529 ; GFX7-LABEL: v_vselect_v32bf16:
38531 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38532 ; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
38533 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
38534 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32
38535 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228
38536 ; GFX7-NEXT: v_and_b32_e32 v25, 1, v25
38537 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25
38538 ; GFX7-NEXT: v_and_b32_e32 v30, 1, v30
38539 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30
38540 ; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
38541 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29
38542 ; GFX7-NEXT: v_and_b32_e32 v28, 1, v28
38543 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28
38544 ; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
38545 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27
38546 ; GFX7-NEXT: v_and_b32_e32 v26, 1, v26
38547 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26
38548 ; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
38549 ; GFX7-NEXT: v_and_b32_e32 v22, 1, v22
38550 ; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
38551 ; GFX7-NEXT: v_and_b32_e32 v20, 1, v20
38552 ; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
38553 ; GFX7-NEXT: v_and_b32_e32 v18, 1, v18
38554 ; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
38555 ; GFX7-NEXT: v_and_b32_e32 v16, 1, v16
38556 ; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
38557 ; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
38558 ; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
38559 ; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
38560 ; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
38561 ; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
38562 ; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
38563 ; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
38564 ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
38565 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
38566 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
38567 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
38568 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
38569 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
38570 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
38571 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
38572 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252
38573 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
38574 ; GFX7-NEXT: s_waitcnt vmcnt(3)
38575 ; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
38576 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
38577 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124
38578 ; GFX7-NEXT: s_waitcnt vmcnt(3)
38579 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38580 ; GFX7-NEXT: s_waitcnt vmcnt(2)
38581 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38582 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38583 ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
38584 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38585 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38586 ; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13]
38587 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120
38588 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248
38589 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
38590 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38591 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38592 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38593 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38594 ; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15]
38595 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
38596 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244
38597 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
38598 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38599 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38600 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38601 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38602 ; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17]
38603 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112
38604 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240
38605 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
38606 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38607 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38608 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38609 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38610 ; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11]
38611 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108
38612 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236
38613 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
38614 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38615 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38616 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38617 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38618 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9]
38619 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
38620 ; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232
38621 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
38622 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38623 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38624 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38625 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
38626 ; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7]
38627 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128
38628 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
38629 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38630 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38631 ; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5]
38632 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
38633 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
38634 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38635 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
38636 ; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
38637 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
38638 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
38639 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
38640 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
38641 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38642 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
38643 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38644 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38645 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc
38646 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
38647 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
38648 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220
38649 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
38650 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38651 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
38652 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38653 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38654 ; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc
38655 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
38656 ; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
38657 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216
38658 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
38659 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38660 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
38661 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38662 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38663 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
38664 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
38665 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
38666 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212
38667 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
38668 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38669 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
38670 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38671 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38672 ; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
38673 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
38674 ; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
38675 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208
38676 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
38677 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38678 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
38679 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38680 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38681 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc
38682 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
38683 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
38684 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
38685 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
38686 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38687 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
38688 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38689 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38690 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
38691 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
38692 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
38693 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
38694 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
38695 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38696 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
38697 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38698 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38699 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
38700 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
38701 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
38702 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196
38703 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
38704 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38705 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
38706 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38707 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38708 ; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc
38709 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
38710 ; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
38711 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192
38712 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
38713 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38714 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
38715 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38716 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38717 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
38718 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
38719 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
38720 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188
38721 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
38722 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38723 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
38724 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38725 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38726 ; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
38727 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
38728 ; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
38729 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
38730 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
38731 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38732 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
38733 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38734 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38735 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
38736 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
38737 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
38738 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180
38739 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
38740 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38741 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
38742 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38743 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38744 ; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
38745 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
38746 ; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
38747 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176
38748 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
38749 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38750 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
38751 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38752 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38753 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
38754 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
38755 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
38756 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172
38757 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
38758 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38759 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
38760 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38761 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38762 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
38763 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
38764 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
38765 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
38766 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
38767 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38768 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
38769 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38770 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38771 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
38772 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
38773 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
38774 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164
38775 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
38776 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38777 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
38778 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38779 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38780 ; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
38781 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
38782 ; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
38783 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
38784 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
38785 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38786 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
38787 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38788 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38789 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
38790 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
38791 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
38792 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
38793 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
38794 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38795 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
38796 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38797 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38798 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
38799 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
38800 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
38801 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
38802 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
38803 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38804 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
38805 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38806 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38807 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
38808 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
38809 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
38810 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148
38811 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
38812 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38813 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
38814 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38815 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38816 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
38817 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
38818 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
38819 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
38820 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
38821 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38822 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
38823 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38824 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38825 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
38826 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
38827 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
38828 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140
38829 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
38830 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38831 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
38832 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38833 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38834 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
38835 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38836 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
38837 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
38838 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
38839 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38840 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
38841 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38842 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38843 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
38844 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38845 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
38846 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
38847 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
38848 ; GFX7-NEXT: s_waitcnt vmcnt(1)
38849 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
38850 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38851 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
38852 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
38853 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
38854 ; GFX7-NEXT: s_setpc_b64 s[30:31]
38856 ; GFX8-LABEL: v_vselect_v32bf16:
38858 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38859 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
38860 ; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
38861 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
38862 ; GFX8-NEXT: v_writelane_b32 v34, s30, 0
38863 ; GFX8-NEXT: v_writelane_b32 v34, s31, 1
38864 ; GFX8-NEXT: v_writelane_b32 v34, s34, 2
38865 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
38866 ; GFX8-NEXT: v_writelane_b32 v34, s35, 3
38867 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38868 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
38869 ; GFX8-NEXT: v_writelane_b32 v34, s36, 4
38870 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38871 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
38872 ; GFX8-NEXT: v_writelane_b32 v34, s37, 5
38873 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
38874 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
38875 ; GFX8-NEXT: v_writelane_b32 v34, s38, 6
38876 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
38877 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
38878 ; GFX8-NEXT: v_writelane_b32 v34, s39, 7
38879 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
38880 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
38881 ; GFX8-NEXT: v_writelane_b32 v34, s40, 8
38882 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
38883 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
38884 ; GFX8-NEXT: v_writelane_b32 v34, s41, 9
38885 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
38886 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
38887 ; GFX8-NEXT: v_writelane_b32 v34, s42, 10
38888 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
38889 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
38890 ; GFX8-NEXT: v_writelane_b32 v34, s43, 11
38891 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
38892 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
38893 ; GFX8-NEXT: v_writelane_b32 v34, s44, 12
38894 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
38895 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
38896 ; GFX8-NEXT: v_writelane_b32 v34, s45, 13
38897 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
38898 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
38899 ; GFX8-NEXT: v_writelane_b32 v34, s46, 14
38900 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
38901 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
38902 ; GFX8-NEXT: v_writelane_b32 v34, s47, 15
38903 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
38904 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
38905 ; GFX8-NEXT: v_writelane_b32 v34, s48, 16
38906 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
38907 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
38908 ; GFX8-NEXT: v_writelane_b32 v34, s49, 17
38909 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
38910 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
38911 ; GFX8-NEXT: v_writelane_b32 v34, s50, 18
38912 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
38913 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16
38914 ; GFX8-NEXT: v_writelane_b32 v34, s51, 19
38915 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
38916 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17
38917 ; GFX8-NEXT: v_writelane_b32 v34, s52, 20
38918 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
38919 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18
38920 ; GFX8-NEXT: v_writelane_b32 v34, s53, 21
38921 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
38922 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19
38923 ; GFX8-NEXT: v_writelane_b32 v34, s54, 22
38924 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
38925 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
38926 ; GFX8-NEXT: v_writelane_b32 v34, s55, 23
38927 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
38928 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
38929 ; GFX8-NEXT: v_writelane_b32 v34, s56, 24
38930 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
38931 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
38932 ; GFX8-NEXT: v_writelane_b32 v34, s57, 25
38933 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0
38934 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
38935 ; GFX8-NEXT: v_writelane_b32 v34, s58, 26
38936 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0
38937 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
38938 ; GFX8-NEXT: v_writelane_b32 v34, s59, 27
38939 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0
38940 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
38941 ; GFX8-NEXT: v_writelane_b32 v34, s60, 28
38942 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0
38943 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
38944 ; GFX8-NEXT: v_writelane_b32 v34, s61, 29
38945 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
38946 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
38947 ; GFX8-NEXT: v_writelane_b32 v34, s62, 30
38948 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
38949 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
38950 ; GFX8-NEXT: v_writelane_b32 v34, s63, 31
38951 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
38952 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
38953 ; GFX8-NEXT: v_writelane_b32 v34, s64, 32
38954 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
38955 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
38956 ; GFX8-NEXT: v_writelane_b32 v34, s65, 33
38957 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0
38958 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
38959 ; GFX8-NEXT: v_writelane_b32 v34, s66, 34
38960 ; GFX8-NEXT: v_writelane_b32 v34, s67, 35
38961 ; GFX8-NEXT: s_waitcnt vmcnt(0)
38962 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
38963 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0
38964 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
38965 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
38966 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
38967 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
38968 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
38969 ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
38970 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
38971 ; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
38972 ; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
38973 ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
38974 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
38975 ; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
38976 ; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
38977 ; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
38978 ; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
38979 ; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
38980 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
38981 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
38982 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
38983 ; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
38984 ; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
38985 ; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
38986 ; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
38987 ; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
38988 ; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
38989 ; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
38990 ; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
38991 ; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
38992 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
38993 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
38994 ; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128
38995 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
38996 ; GFX8-NEXT: s_waitcnt vmcnt(1)
38997 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29
38998 ; GFX8-NEXT: s_waitcnt vmcnt(0)
38999 ; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32
39000 ; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67]
39001 ; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65]
39002 ; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31
39003 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30
39004 ; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63]
39005 ; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61]
39006 ; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27
39007 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26
39008 ; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59]
39009 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39010 ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25
39011 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24
39012 ; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55]
39013 ; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39014 ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23
39015 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22
39016 ; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51]
39017 ; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39018 ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21
39019 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20
39020 ; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47]
39021 ; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39022 ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19
39023 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18
39024 ; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43]
39025 ; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39026 ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17
39027 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16
39028 ; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39]
39029 ; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39030 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
39031 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14
39032 ; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35]
39033 ; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39034 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13
39035 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12
39036 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29]
39037 ; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39038 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11
39039 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10
39040 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25]
39041 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39042 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9
39043 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8
39044 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21]
39045 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39046 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
39047 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6
39048 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17]
39049 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39050 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
39051 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4
39052 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13]
39053 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39054 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
39055 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2
39056 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9]
39057 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39058 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
39059 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0
39060 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5]
39061 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
39062 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
39063 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39064 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
39065 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39066 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
39067 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9
39068 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39069 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39070 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
39071 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13
39072 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15
39073 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17
39074 ; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39075 ; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39076 ; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39077 ; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39078 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
39079 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21
39080 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23
39081 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25
39082 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27
39083 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31
39084 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32
39085 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28
39086 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39087 ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39088 ; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39089 ; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39090 ; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39091 ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39092 ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39093 ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
39094 ; GFX8-NEXT: v_readlane_b32 s67, v34, 35
39095 ; GFX8-NEXT: v_readlane_b32 s66, v34, 34
39096 ; GFX8-NEXT: v_readlane_b32 s65, v34, 33
39097 ; GFX8-NEXT: v_readlane_b32 s64, v34, 32
39098 ; GFX8-NEXT: v_readlane_b32 s63, v34, 31
39099 ; GFX8-NEXT: v_readlane_b32 s62, v34, 30
39100 ; GFX8-NEXT: v_readlane_b32 s61, v34, 29
39101 ; GFX8-NEXT: v_readlane_b32 s60, v34, 28
39102 ; GFX8-NEXT: v_readlane_b32 s59, v34, 27
39103 ; GFX8-NEXT: v_readlane_b32 s58, v34, 26
39104 ; GFX8-NEXT: v_readlane_b32 s57, v34, 25
39105 ; GFX8-NEXT: v_readlane_b32 s56, v34, 24
39106 ; GFX8-NEXT: v_readlane_b32 s55, v34, 23
39107 ; GFX8-NEXT: v_readlane_b32 s54, v34, 22
39108 ; GFX8-NEXT: v_readlane_b32 s53, v34, 21
39109 ; GFX8-NEXT: v_readlane_b32 s52, v34, 20
39110 ; GFX8-NEXT: v_readlane_b32 s51, v34, 19
39111 ; GFX8-NEXT: v_readlane_b32 s50, v34, 18
39112 ; GFX8-NEXT: v_readlane_b32 s49, v34, 17
39113 ; GFX8-NEXT: v_readlane_b32 s48, v34, 16
39114 ; GFX8-NEXT: v_readlane_b32 s47, v34, 15
39115 ; GFX8-NEXT: v_readlane_b32 s46, v34, 14
39116 ; GFX8-NEXT: v_readlane_b32 s45, v34, 13
39117 ; GFX8-NEXT: v_readlane_b32 s44, v34, 12
39118 ; GFX8-NEXT: v_readlane_b32 s43, v34, 11
39119 ; GFX8-NEXT: v_readlane_b32 s42, v34, 10
39120 ; GFX8-NEXT: v_readlane_b32 s41, v34, 9
39121 ; GFX8-NEXT: v_readlane_b32 s40, v34, 8
39122 ; GFX8-NEXT: v_readlane_b32 s39, v34, 7
39123 ; GFX8-NEXT: v_readlane_b32 s38, v34, 6
39124 ; GFX8-NEXT: v_readlane_b32 s37, v34, 5
39125 ; GFX8-NEXT: v_readlane_b32 s36, v34, 4
39126 ; GFX8-NEXT: v_readlane_b32 s35, v34, 3
39127 ; GFX8-NEXT: v_readlane_b32 s34, v34, 2
39128 ; GFX8-NEXT: v_readlane_b32 s31, v34, 1
39129 ; GFX8-NEXT: v_readlane_b32 s30, v34, 0
39130 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
39131 ; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39132 ; GFX8-NEXT: s_mov_b64 exec, s[4:5]
39133 ; GFX8-NEXT: s_waitcnt vmcnt(0)
39134 ; GFX8-NEXT: s_setpc_b64 s[30:31]
39136 ; GFX9-LABEL: v_vselect_v32bf16:
39138 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39139 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
39140 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
39141 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
39142 ; GFX9-NEXT: v_writelane_b32 v33, s30, 0
39143 ; GFX9-NEXT: v_writelane_b32 v33, s31, 1
39144 ; GFX9-NEXT: v_writelane_b32 v33, s34, 2
39145 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
39146 ; GFX9-NEXT: v_writelane_b32 v33, s35, 3
39147 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
39148 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
39149 ; GFX9-NEXT: v_writelane_b32 v33, s36, 4
39150 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
39151 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
39152 ; GFX9-NEXT: v_writelane_b32 v33, s37, 5
39153 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
39154 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
39155 ; GFX9-NEXT: v_writelane_b32 v33, s38, 6
39156 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
39157 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
39158 ; GFX9-NEXT: v_writelane_b32 v33, s39, 7
39159 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
39160 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
39161 ; GFX9-NEXT: v_writelane_b32 v33, s40, 8
39162 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
39163 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
39164 ; GFX9-NEXT: v_writelane_b32 v33, s41, 9
39165 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
39166 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
39167 ; GFX9-NEXT: v_writelane_b32 v33, s42, 10
39168 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
39169 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
39170 ; GFX9-NEXT: v_writelane_b32 v33, s43, 11
39171 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
39172 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
39173 ; GFX9-NEXT: v_writelane_b32 v33, s44, 12
39174 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
39175 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
39176 ; GFX9-NEXT: v_writelane_b32 v33, s45, 13
39177 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
39178 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
39179 ; GFX9-NEXT: v_writelane_b32 v33, s46, 14
39180 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
39181 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
39182 ; GFX9-NEXT: v_writelane_b32 v33, s47, 15
39183 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
39184 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
39185 ; GFX9-NEXT: v_writelane_b32 v33, s48, 16
39186 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
39187 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
39188 ; GFX9-NEXT: v_writelane_b32 v33, s49, 17
39189 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
39190 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
39191 ; GFX9-NEXT: v_writelane_b32 v33, s50, 18
39192 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
39193 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
39194 ; GFX9-NEXT: v_writelane_b32 v33, s51, 19
39195 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
39196 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19
39197 ; GFX9-NEXT: v_writelane_b32 v33, s52, 20
39198 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
39199 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
39200 ; GFX9-NEXT: v_writelane_b32 v33, s53, 21
39201 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
39202 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21
39203 ; GFX9-NEXT: v_writelane_b32 v33, s54, 22
39204 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
39205 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
39206 ; GFX9-NEXT: v_writelane_b32 v33, s55, 23
39207 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
39208 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23
39209 ; GFX9-NEXT: v_writelane_b32 v33, s56, 24
39210 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0
39211 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22
39212 ; GFX9-NEXT: v_writelane_b32 v33, s57, 25
39213 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0
39214 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25
39215 ; GFX9-NEXT: v_writelane_b32 v33, s58, 26
39216 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0
39217 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24
39218 ; GFX9-NEXT: v_writelane_b32 v33, s59, 27
39219 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0
39220 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27
39221 ; GFX9-NEXT: v_writelane_b32 v33, s60, 28
39222 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
39223 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26
39224 ; GFX9-NEXT: v_writelane_b32 v33, s61, 29
39225 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
39226 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29
39227 ; GFX9-NEXT: v_writelane_b32 v33, s62, 30
39228 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
39229 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28
39230 ; GFX9-NEXT: v_writelane_b32 v33, s63, 31
39231 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
39232 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
39233 ; GFX9-NEXT: v_writelane_b32 v33, s64, 32
39234 ; GFX9-NEXT: v_writelane_b32 v33, s65, 33
39235 ; GFX9-NEXT: v_writelane_b32 v33, s66, 34
39236 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
39237 ; GFX9-NEXT: v_writelane_b32 v33, s67, 35
39238 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
39239 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39240 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
39241 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0
39242 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v30
39243 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0
39244 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
39245 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
39246 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
39247 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
39248 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
39249 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
39250 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
39251 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
39252 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
39253 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
39254 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
39255 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
39256 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
39257 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
39258 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
39259 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
39260 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
39261 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
39262 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
39263 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
39264 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
39265 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
39266 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
39267 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
39268 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
39269 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
39270 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
39271 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
39272 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
39273 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60
39274 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
39275 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
39276 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39277 ; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67]
39278 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39279 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
39280 ; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65]
39281 ; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63]
39282 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
39283 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
39284 ; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61]
39285 ; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59]
39286 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
39287 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
39288 ; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57]
39289 ; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55]
39290 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
39291 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
39292 ; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53]
39293 ; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51]
39294 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
39295 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
39296 ; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49]
39297 ; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47]
39298 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
39299 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
39300 ; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45]
39301 ; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[42:43]
39302 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
39303 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
39304 ; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41]
39305 ; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39]
39306 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
39307 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
39308 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37]
39309 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35]
39310 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
39311 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
39312 ; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31]
39313 ; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
39314 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
39315 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
39316 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
39317 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
39318 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
39319 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
39320 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
39321 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
39322 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
39323 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
39324 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
39325 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
39326 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
39327 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
39328 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
39329 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
39330 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
39331 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
39332 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
39333 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
39334 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
39335 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
39336 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
39337 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
39338 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
39339 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
39340 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
39341 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
39342 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
39343 ; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4
39344 ; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4
39345 ; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4
39346 ; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4
39347 ; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4
39348 ; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4
39349 ; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4
39350 ; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4
39351 ; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4
39352 ; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4
39353 ; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4
39354 ; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4
39355 ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4
39356 ; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4
39357 ; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4
39358 ; GFX9-NEXT: v_readlane_b32 s67, v33, 35
39359 ; GFX9-NEXT: v_readlane_b32 s66, v33, 34
39360 ; GFX9-NEXT: v_readlane_b32 s65, v33, 33
39361 ; GFX9-NEXT: v_readlane_b32 s64, v33, 32
39362 ; GFX9-NEXT: v_readlane_b32 s63, v33, 31
39363 ; GFX9-NEXT: v_readlane_b32 s62, v33, 30
39364 ; GFX9-NEXT: v_readlane_b32 s61, v33, 29
39365 ; GFX9-NEXT: v_readlane_b32 s60, v33, 28
39366 ; GFX9-NEXT: v_readlane_b32 s59, v33, 27
39367 ; GFX9-NEXT: v_readlane_b32 s58, v33, 26
39368 ; GFX9-NEXT: v_readlane_b32 s57, v33, 25
39369 ; GFX9-NEXT: v_readlane_b32 s56, v33, 24
39370 ; GFX9-NEXT: v_readlane_b32 s55, v33, 23
39371 ; GFX9-NEXT: v_readlane_b32 s54, v33, 22
39372 ; GFX9-NEXT: v_readlane_b32 s53, v33, 21
39373 ; GFX9-NEXT: v_readlane_b32 s52, v33, 20
39374 ; GFX9-NEXT: v_readlane_b32 s51, v33, 19
39375 ; GFX9-NEXT: v_readlane_b32 s50, v33, 18
39376 ; GFX9-NEXT: v_readlane_b32 s49, v33, 17
39377 ; GFX9-NEXT: v_readlane_b32 s48, v33, 16
39378 ; GFX9-NEXT: v_readlane_b32 s47, v33, 15
39379 ; GFX9-NEXT: v_readlane_b32 s46, v33, 14
39380 ; GFX9-NEXT: v_readlane_b32 s45, v33, 13
39381 ; GFX9-NEXT: v_readlane_b32 s44, v33, 12
39382 ; GFX9-NEXT: v_readlane_b32 s43, v33, 11
39383 ; GFX9-NEXT: v_readlane_b32 s42, v33, 10
39384 ; GFX9-NEXT: v_readlane_b32 s41, v33, 9
39385 ; GFX9-NEXT: v_readlane_b32 s40, v33, 8
39386 ; GFX9-NEXT: v_readlane_b32 s39, v33, 7
39387 ; GFX9-NEXT: v_readlane_b32 s38, v33, 6
39388 ; GFX9-NEXT: v_readlane_b32 s37, v33, 5
39389 ; GFX9-NEXT: v_readlane_b32 s36, v33, 4
39390 ; GFX9-NEXT: v_readlane_b32 s35, v33, 3
39391 ; GFX9-NEXT: v_readlane_b32 s34, v33, 2
39392 ; GFX9-NEXT: v_readlane_b32 s31, v33, 1
39393 ; GFX9-NEXT: v_readlane_b32 s30, v33, 0
39394 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
39395 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39396 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
39397 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39398 ; GFX9-NEXT: s_setpc_b64 s[30:31]
39400 ; GFX10-LABEL: v_vselect_v32bf16:
39402 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39403 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1
39404 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
39405 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
39406 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
39407 ; GFX10-NEXT: v_and_b32_e32 v29, 1, v29
39408 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
39409 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
39410 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
39411 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
39412 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
39413 ; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
39414 ; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
39415 ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
39416 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
39417 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
39418 ; GFX10-NEXT: s_clause 0x14
39419 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
39420 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
39421 ; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32
39422 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128
39423 ; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
39424 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
39425 ; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
39426 ; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52
39427 ; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120
39428 ; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
39429 ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
39430 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100
39431 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36
39432 ; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
39433 ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40
39434 ; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
39435 ; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44
39436 ; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112
39437 ; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72
39438 ; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76
39439 ; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80
39440 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
39441 ; GFX10-NEXT: s_clause 0x1
39442 ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92
39443 ; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28
39444 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30
39445 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
39446 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28
39447 ; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
39448 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26
39449 ; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24
39450 ; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
39451 ; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22
39452 ; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
39453 ; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20
39454 ; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
39455 ; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18
39456 ; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
39457 ; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16
39458 ; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
39459 ; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14
39460 ; GFX10-NEXT: s_clause 0x1
39461 ; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
39462 ; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24
39463 ; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12
39464 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
39465 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0
39466 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
39467 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
39468 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
39469 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
39470 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
39471 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
39472 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1
39473 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
39474 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
39475 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
39476 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
39477 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
39478 ; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
39479 ; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
39480 ; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
39481 ; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
39482 ; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
39483 ; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
39484 ; GFX10-NEXT: v_and_b32_e32 v23, 1, v23
39485 ; GFX10-NEXT: v_and_b32_e32 v25, 1, v25
39486 ; GFX10-NEXT: v_and_b32_e32 v27, 1, v27
39487 ; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10
39488 ; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8
39489 ; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6
39490 ; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
39491 ; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
39492 ; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
39493 ; GFX10-NEXT: v_writelane_b32 v40, s34, 2
39494 ; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
39495 ; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
39496 ; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
39497 ; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21
39498 ; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19
39499 ; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17
39500 ; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15
39501 ; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
39502 ; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
39503 ; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
39504 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
39505 ; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
39506 ; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
39507 ; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
39508 ; GFX10-NEXT: s_waitcnt vmcnt(32)
39509 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
39510 ; GFX10-NEXT: s_waitcnt vmcnt(31)
39511 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32
39512 ; GFX10-NEXT: s_waitcnt vmcnt(30)
39513 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v33
39514 ; GFX10-NEXT: s_waitcnt vmcnt(29)
39515 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34
39516 ; GFX10-NEXT: s_waitcnt vmcnt(28)
39517 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4
39518 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35
39519 ; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5
39520 ; GFX10-NEXT: s_waitcnt vmcnt(25)
39521 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7
39522 ; GFX10-NEXT: s_waitcnt vmcnt(24)
39523 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39
39524 ; GFX10-NEXT: s_waitcnt vmcnt(23)
39525 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6
39526 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48
39527 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38
39528 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37
39529 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36
39530 ; GFX10-NEXT: s_waitcnt vmcnt(18)
39531 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10
39532 ; GFX10-NEXT: s_waitcnt vmcnt(17)
39533 ; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54
39534 ; GFX10-NEXT: s_waitcnt vmcnt(16)
39535 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9
39536 ; GFX10-NEXT: s_waitcnt vmcnt(15)
39537 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8
39538 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64
39539 ; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55
39540 ; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
39541 ; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52
39542 ; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11
39543 ; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51
39544 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50
39545 ; GFX10-NEXT: s_waitcnt vmcnt(9)
39546 ; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12
39547 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
39548 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
39549 ; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13
39550 ; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68
39551 ; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
39552 ; GFX10-NEXT: s_waitcnt vmcnt(6)
39553 ; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15
39554 ; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
39555 ; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
39556 ; GFX10-NEXT: s_waitcnt vmcnt(5)
39557 ; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16
39558 ; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
39559 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67
39560 ; GFX10-NEXT: s_waitcnt vmcnt(4)
39561 ; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17
39562 ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
39563 ; GFX10-NEXT: s_waitcnt vmcnt(1)
39564 ; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14
39565 ; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
39566 ; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
39567 ; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66
39568 ; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18
39569 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
39570 ; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65
39571 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39572 ; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19
39573 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
39574 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
39575 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
39576 ; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo
39577 ; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20
39578 ; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21
39579 ; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22
39580 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23
39581 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24
39582 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25
39583 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26
39584 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
39585 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
39586 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
39587 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
39588 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
39589 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
39590 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
39591 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
39592 ; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
39593 ; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
39594 ; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100
39595 ; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100
39596 ; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100
39597 ; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100
39598 ; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100
39599 ; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100
39600 ; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100
39601 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100
39602 ; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100
39603 ; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100
39604 ; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100
39605 ; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
39606 ; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
39607 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
39608 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
39609 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
39610 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
39611 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1
39612 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
39613 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
39614 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
39615 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39616 ; GFX10-NEXT: s_setpc_b64 s[30:31]
39618 ; GFX11TRUE16-LABEL: v_vselect_v32bf16:
39619 ; GFX11TRUE16: ; %bb.0:
39620 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39621 ; GFX11TRUE16-NEXT: s_clause 0x1f
39622 ; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32
39623 ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
39624 ; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
39625 ; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
39626 ; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
39627 ; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
39628 ; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
39629 ; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
39630 ; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
39631 ; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
39632 ; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
39633 ; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
39634 ; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
39635 ; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
39636 ; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
39637 ; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
39638 ; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
39639 ; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
39640 ; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
39641 ; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
39642 ; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
39643 ; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
39644 ; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
39645 ; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
39646 ; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
39647 ; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
39648 ; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
39649 ; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
39650 ; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
39651 ; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
39652 ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
39653 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
39654 ; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
39655 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
39656 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v96.l, v32.l
39657 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
39658 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v33.l
39659 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
39660 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v98.l, v34.l
39661 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
39662 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v35.l
39663 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
39664 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v100.l, v36.l
39665 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
39666 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v101.l, v37.l
39667 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
39668 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v102.l, v38.l
39669 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
39670 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v103.l, v39.l
39671 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
39672 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v112.l, v48.l
39673 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
39674 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v113.l, v49.l
39675 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
39676 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v114.l, v50.l
39677 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
39678 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v115.l, v51.l
39679 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
39680 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v116.l, v52.l
39681 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
39682 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v117.l, v53.l
39683 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
39684 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v118.l, v54.l
39685 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
39686 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v119.l, v55.l
39687 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15)
39688 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v128.l, v64.l
39689 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
39690 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v129.l, v65.l
39691 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13)
39692 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v130.l, v66.l
39693 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
39694 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v131.l, v67.l
39695 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
39696 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v132.l, v68.l
39697 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
39698 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v133.l, v69.l
39699 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
39700 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v134.l, v70.l
39701 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
39702 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v135.l, v71.l
39703 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
39704 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70
39705 ; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
39706 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
39707 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v146.l, v82.l
39708 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
39709 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v147.l, v83.l
39710 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
39711 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
39712 ; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
39713 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
39714 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
39715 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
39716 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l
39717 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84
39718 ; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
39719 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v96, v96, v97, vcc_lo
39720 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
39721 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
39722 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
39723 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v85.l
39724 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
39725 ; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
39726 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v98, v98, v99, vcc_lo
39727 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
39728 ; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
39729 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v144.l, v80.l
39730 ; GFX11TRUE16-NEXT: v_mov_b16_e64 v145.l, v81.l
39731 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
39732 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v100, v101, vcc_lo
39733 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80
39734 ; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
39735 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
39736 ; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
39737 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
39738 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
39739 ; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
39740 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v102, v103, vcc_lo
39741 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
39742 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
39743 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
39744 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66
39745 ; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
39746 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v112, v113, vcc_lo
39747 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
39748 ; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
39749 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
39750 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64
39751 ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
39752 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v114, v115, vcc_lo
39753 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
39754 ; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
39755 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
39756 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
39757 ; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
39758 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v116, v117, vcc_lo
39759 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
39760 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
39761 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
39762 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52
39763 ; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
39764 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v118, v119, vcc_lo
39765 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
39766 ; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
39767 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
39768 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50
39769 ; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
39770 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v128, v129, vcc_lo
39771 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
39772 ; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
39773 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
39774 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
39775 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
39776 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v130, v131, vcc_lo
39777 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
39778 ; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
39779 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
39780 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
39781 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
39782 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v132, v133, vcc_lo
39783 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
39784 ; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
39785 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39786 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39787 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
39788 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v134, v135, vcc_lo
39789 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
39790 ; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
39791 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
39792 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
39793 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
39794 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v144, v145, vcc_lo
39795 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
39796 ; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
39797 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
39798 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39799 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
39800 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v146, v147, vcc_lo
39801 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
39802 ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
39803 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
39804 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l
39805 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
39806 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v87.l
39807 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
39808 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v30, v97, vcc_lo
39809 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
39810 ; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
39811 ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86
39812 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l
39813 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v84.l, v85.l
39814 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v28, v99, vcc_lo
39815 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
39816 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l
39817 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v85.l, v87.l
39818 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo
39819 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
39820 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo
39821 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
39822 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
39823 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
39824 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
39825 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
39826 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo
39827 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
39828 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
39829 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
39830 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
39831 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
39832 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
39833 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
39834 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo
39835 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
39836 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo
39837 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
39838 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo
39839 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
39840 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
39841 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
39842 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v30, v84, vcc_lo
39843 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
39844 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v28, v85, vcc_lo
39845 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
39846 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
39847 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
39848 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo
39849 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
39850 ; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
39851 ; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
39852 ; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
39853 ; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
39854 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo
39855 ; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
39856 ; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
39857 ; GFX11TRUE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
39858 ; GFX11TRUE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100
39859 ; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
39860 ; GFX11TRUE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
39861 ; GFX11TRUE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100
39862 ; GFX11TRUE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100
39863 ; GFX11TRUE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100
39864 ; GFX11TRUE16-NEXT: v_perm_b32 v14, v29, v98, 0x5040100
39865 ; GFX11TRUE16-NEXT: v_perm_b32 v15, v31, v96, 0x5040100
39866 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
39868 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
39869 ; GFX11FAKE16: ; %bb.0:
39870 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39871 ; GFX11FAKE16-NEXT: s_clause 0x1f
39872 ; GFX11FAKE16-NEXT: scratch_load_u16 v31, off, s32
39873 ; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
39874 ; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
39875 ; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
39876 ; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
39877 ; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
39878 ; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
39879 ; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
39880 ; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
39881 ; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
39882 ; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
39883 ; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
39884 ; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
39885 ; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
39886 ; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
39887 ; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
39888 ; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
39889 ; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
39890 ; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
39891 ; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
39892 ; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
39893 ; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
39894 ; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
39895 ; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
39896 ; GFX11FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
39897 ; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
39898 ; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
39899 ; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
39900 ; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
39901 ; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
39902 ; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
39903 ; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
39904 ; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
39905 ; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30
39906 ; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28
39907 ; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26
39908 ; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24
39909 ; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22
39910 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
39911 ; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
39912 ; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20
39913 ; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18
39914 ; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16
39915 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30)
39916 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
39917 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
39918 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
39919 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
39920 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
39921 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
39922 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28)
39923 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo
39924 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
39925 ; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7
39926 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
39927 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
39928 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
39929 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26)
39930 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
39931 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
39932 ; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
39933 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
39934 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
39935 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
39936 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24)
39937 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo
39938 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
39939 ; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
39940 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
39941 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
39942 ; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
39943 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22)
39944 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo
39945 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
39946 ; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
39947 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
39948 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
39949 ; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
39950 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20)
39951 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo
39952 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
39953 ; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
39954 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
39955 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50
39956 ; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
39957 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18)
39958 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo
39959 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
39960 ; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
39961 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
39962 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52
39963 ; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
39964 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16)
39965 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo
39966 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
39967 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
39968 ; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
39969 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
39970 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
39971 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14)
39972 ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
39973 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
39974 ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17
39975 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
39976 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64
39977 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12)
39978 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo
39979 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
39980 ; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
39981 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
39982 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66
39983 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10)
39984 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo
39985 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
39986 ; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
39987 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
39988 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
39989 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8)
39990 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo
39991 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
39992 ; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27
39993 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
39994 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70
39995 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6)
39996 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo
39997 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
39998 ; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
39999 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
40000 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80
40001 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4)
40002 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo
40003 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
40004 ; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31
40005 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
40006 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
40007 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2)
40008 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo
40009 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
40010 ; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
40011 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
40012 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84
40013 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
40014 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo
40015 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
40016 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
40017 ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86
40018 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo
40019 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
40020 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo
40021 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
40022 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
40023 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
40024 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
40025 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
40026 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo
40027 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
40028 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
40029 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
40030 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
40031 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
40032 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
40033 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
40034 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo
40035 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
40036 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo
40037 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
40038 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo
40039 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
40040 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
40041 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40042 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo
40043 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40044 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo
40045 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
40046 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
40047 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
40048 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo
40049 ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
40050 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
40051 ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
40052 ; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
40053 ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
40054 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo
40055 ; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
40056 ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
40057 ; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
40058 ; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100
40059 ; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
40060 ; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
40061 ; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100
40062 ; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100
40063 ; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100
40064 ; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100
40065 ; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100
40066 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
40067 %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
40068 ret <32 x bfloat> %op
; Declarations of the llvm.fma intrinsic overloads on bfloat: the scalar form
; and the 2-, 3- and 4-element vector forms. Each takes three operands of the
; same type and returns that type.
40071 declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
40072 declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40073 declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40074 declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; Test v_fma_bf16 - scalar bfloat fused multiply-add via @llvm.fma.bf16.
;
; The three operands arrive in v0 (%a), v1 (%b) and v2 (%c); the result is
; returned in v0. Every expected sequence performs the arithmetic as a single
; f32 FMA (v_fma_f32 or v_fmac_f32); the targets differ only in how the
; operands are widened and how the f32 result is narrowed:
;
;   * GCN and GFX7 prefixes (default subtarget, hawaii) - operands are
;     multiplied by 1.0 and masked with 0xffff0000 before the FMA, and the
;     result is masked with 0xffff0000 again (low 16 bits dropped).
;   * GFX8 / GFX9 / GFX10 / GFX11 prefixes (tonga, gfx900, gfx1010, gfx1100) -
;     operands are shifted left by 16 before the FMA. The result is narrowed
;     with: bit 16 extracted, added to the value together with 0x7fff, an
;     alternative value formed by OR-ing in 0x400000, a select between the two
;     on an unordered self-compare, then a right shift by 16.
;     NOTE(review): this looks like round-to-nearest-even with NaN quieting -
;     confirm against the bf16 rounding lowering.
;   * GFX8 uses two v_add_u32 where GFX9 and later use one v_add3_u32; GFX9
;     materialises 0x7fff in s4, GFX10 and GFX11 use it as an inline literal.
;   * The single GFX11 prefix is listed by both gfx1100 invocations at the top
;     of the file (+real-true16 and -real-true16).
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script rather than editing by hand.
40076 define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
40077 ; GCN-LABEL: v_fma_bf16:
40079 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40080 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40081 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40082 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40083 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40084 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40085 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40086 ; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
40087 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40088 ; GCN-NEXT: s_setpc_b64 s[30:31]
40090 ; GFX7-LABEL: v_fma_bf16:
40092 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40093 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40094 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40095 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40096 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40097 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40098 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40099 ; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
40100 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40101 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40103 ; GFX8-LABEL: v_fma_bf16:
40105 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40106 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40107 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40108 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40109 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
40110 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40111 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40112 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40113 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40114 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40115 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40116 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40117 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40119 ; GFX9-LABEL: v_fma_bf16:
40121 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40122 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40123 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40124 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40125 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
40126 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40127 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40128 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40129 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40130 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40131 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40132 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40133 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40135 ; GFX10-LABEL: v_fma_bf16:
40137 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40138 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40139 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40140 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40141 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
40142 ; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
40143 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
40144 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40145 ; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
40146 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40147 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40148 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40150 ; GFX11-LABEL: v_fma_bf16:
40152 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40153 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
40154 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40155 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40157 ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
40158 ; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
40159 ; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2
40160 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40161 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40162 ; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
40163 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
40164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40165 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40166 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40167 %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
; Test v_fma_v2bf16 - element-wise fused multiply-add on <2 x bfloat> via
; @llvm.fma.v2bf16. Each element is computed with its own f32 FMA.
;
; Operand layout as shown by the expected code:
;
;   * GCN and GFX7 prefixes (default subtarget, hawaii) - one element per VGPR:
;     %a in v0/v1, %b in v2/v3, %c in v4/v5, result in v0/v1. Every register is
;     multiplied by 1.0 and masked with 0xffff0000, the two FMAs are
;     v0 = fma(v0, v2, v4) and v1 = fma(v1, v3, v5), and each result is masked
;     with 0xffff0000.
;   * GFX8 / GFX9 / GFX10 / GFX11 prefixes (tonga, gfx900, gfx1010, gfx1100) -
;     both elements share one VGPR per operand (%a in v0, %b in v1, %c in v2).
;     The low element is widened with a left shift by 16 and the high element
;     with a 0xffff0000 mask. Each f32 result then goes through the
;     bit-16 / 0x7fff / 0x400000 / unordered-compare / select sequence.
;     NOTE(review): this looks like round-to-nearest-even with NaN quieting -
;     confirm against the bf16 rounding lowering.
;   * The two results are recombined into v0 with v_alignbit_b32 on GFX8 (after
;     shifting the high result right by 16) and with v_perm_b32 using selector
;     0x7060302 on GFX9 and later (held in s4 on GFX9, inline on GFX10/GFX11).
;   * GFX11 expects both FMAs fused into one v_dual_fmac_f32; the single GFX11
;     prefix is listed by both gfx1100 invocations at the top of the file.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with the update script rather than editing by hand.
40171 define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40172 ; GCN-LABEL: v_fma_v2bf16:
40174 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40175 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40176 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40177 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40178 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40179 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40180 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40181 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40182 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40183 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40184 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40185 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40186 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40187 ; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
40188 ; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
40189 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40190 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40191 ; GCN-NEXT: s_setpc_b64 s[30:31]
40193 ; GFX7-LABEL: v_fma_v2bf16:
40195 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40196 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40197 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40198 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40199 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40200 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40201 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40202 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40203 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40204 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40205 ; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
40206 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
40207 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40208 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40209 ; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3
40210 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40211 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40212 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40214 ; GFX8-LABEL: v_fma_v2bf16:
40216 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40217 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40218 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40219 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40220 ; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
40221 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
40222 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
40223 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40224 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40225 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40226 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
40227 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
40228 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
40229 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40230 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40231 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
40232 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40233 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40234 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40235 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40236 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40237 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40238 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40239 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40241 ; GFX9-LABEL: v_fma_v2bf16:
40243 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40244 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40245 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40246 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40247 ; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
40248 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40249 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40250 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40251 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
40252 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40253 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
40254 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
40255 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
40256 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40257 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40258 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
40259 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40260 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40261 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40262 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40263 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40264 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40267 ; GFX10-LABEL: v_fma_v2bf16:
40269 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40270 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40271 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40272 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40273 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40274 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40275 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40276 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
40277 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
40278 ; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
40279 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
40280 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
40281 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
40282 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
40283 ; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
40284 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
40285 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40286 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40287 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40288 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
40289 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40291 ; GFX11-LABEL: v_fma_v2bf16:
40293 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40294 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
40295 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
40296 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40297 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40298 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
40299 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40300 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40301 ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
40302 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
40303 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
40304 ; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
40305 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
40306 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
40307 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
40308 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
40309 ; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
40310 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
40311 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
40312 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
40313 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
40314 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40315 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
40316 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40317 %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
40318 ret <2 x bfloat> %op
; Codegen test for llvm.fma on <3 x bfloat>. The check lines inside the
; function are autogenerated (see the NOTE at the top of the file); there is
; one section per FileCheck prefix taken from the RUN lines.
;  * GCN/GFX7: one v_fma_f32 per element on inputs masked with 0xffff0000;
;    each result is masked with 0xffff0000 again.
;  * GFX8/GFX9: v_fma_f32 followed, per element, by a v_bfe_u32 / add 0x7fff /
;    v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the
;    f32->bf16 rounding with NaN handling -- confirm against the lowering).
;  * GFX10/GFX11: same shape, using v_fmac_f32 (v_dual_fmac_f32 on GFX11).
; NOTE(review): this function has separate GFX11TRUE16 and GFX11FAKE16
; sections; as written they differ only in the final s_delay_alu /
; v_alignbit_b32 pair (v0 vs. s0 as the first source operand). Regenerate with
; utils/update_llc_test_checks.py instead of editing the checks by hand.
40321 define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
40322 ; GCN-LABEL: v_fma_v3bf16:
40324 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40325 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40326 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40327 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
40328 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40329 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40330 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
40331 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40332 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40333 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
40334 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40335 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40336 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40337 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40338 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40339 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40340 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40341 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40342 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40343 ; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
40344 ; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
40345 ; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
40346 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40347 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40348 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40349 ; GCN-NEXT: s_setpc_b64 s[30:31]
40351 ; GFX7-LABEL: v_fma_v3bf16:
40353 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40354 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40355 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40356 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
40357 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40358 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40359 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
40360 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40361 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40362 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40363 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40364 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40365 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
40366 ; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8
40367 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
40368 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40369 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40370 ; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5
40371 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
40372 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40373 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40374 ; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4
40375 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40376 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40377 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40378 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40380 ; GFX8-LABEL: v_fma_v3bf16:
40382 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40383 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40384 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40385 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40386 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
40387 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
40388 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
40389 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
40390 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
40391 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40392 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40393 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40394 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40395 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
40396 ; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
40397 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
40398 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
40399 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
40400 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40401 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40402 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40403 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
40404 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
40405 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
40406 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40407 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
40408 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
40409 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
40410 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
40411 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
40412 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40413 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40414 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40415 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
40416 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40417 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40419 ; GFX9-LABEL: v_fma_v3bf16:
40421 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40422 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40423 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40424 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40425 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
40426 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
40427 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40428 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
40429 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
40430 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40431 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40432 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40433 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40434 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
40435 ; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
40436 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40437 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40438 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40439 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
40440 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
40441 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
40442 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
40443 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40444 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
40445 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
40446 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
40447 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
40448 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40449 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40450 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40451 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40452 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
40453 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40455 ; GFX10-LABEL: v_fma_v3bf16:
40457 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40458 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40459 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40460 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40461 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
40462 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40463 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40464 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40465 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40466 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40467 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
40468 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
40469 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
40470 ; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
40471 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
40472 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40473 ; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
40474 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
40475 ; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40476 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
40477 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
40478 ; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40479 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40480 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40481 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40482 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40483 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40484 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40485 ; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40486 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
40487 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40489 ; GFX11TRUE16-LABEL: v_fma_v3bf16:
40490 ; GFX11TRUE16: ; %bb.0:
40491 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40492 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40493 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40494 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40495 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40496 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40497 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40498 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40499 ; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40500 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40501 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2
40502 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40503 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40504 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
40505 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
40506 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40507 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40508 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40509 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
40510 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
40511 ; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
40512 ; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
40513 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40514 ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40515 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
40516 ; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40517 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40518 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40519 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40520 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40521 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40522 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40523 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40524 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
40525 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v3, 16
40526 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
40528 ; GFX11FAKE16-LABEL: v_fma_v3bf16:
40529 ; GFX11FAKE16: ; %bb.0:
40530 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40531 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
40532 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
40533 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
40534 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40535 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40536 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40537 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
40538 ; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
40539 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
40540 ; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
40541 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40542 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
40543 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
40544 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
40545 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
40546 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40547 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40548 ; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
40549 ; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
40550 ; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
40551 ; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
40552 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
40553 ; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
40554 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
40555 ; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
40556 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40557 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
40558 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40559 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
40560 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40561 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
40562 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
40563 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
40564 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
40565 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
40566 %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
40567 ret <3 x bfloat> %op
; Codegen test for llvm.fma on <4 x bfloat>. The check lines inside the
; function are autogenerated (see the NOTE at the top of the file); there is
; one section per FileCheck prefix taken from the RUN lines. Both gfx1100 RUN
; lines share a single GFX11 section here.
;  * GCN/GFX7: four v_fma_f32 on inputs masked with 0xffff0000; each result is
;    masked with 0xffff0000 again.
;  * GFX8/GFX9: v_fma_f32 followed, per element, by a v_bfe_u32 / add 0x7fff /
;    v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the
;    f32->bf16 rounding with NaN handling -- confirm against the lowering);
;    pairs are repacked with v_alignbit_b32 (GFX8) or v_perm_b32 (GFX9).
;  * GFX10/GFX11: same shape, using v_fmac_f32 (plus v_dual_fmac_f32 on GFX11)
;    and v_perm_b32 with the literal selector 0x7060302.
40570 define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
40571 ; GCN-LABEL: v_fma_v4bf16:
40573 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40574 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40575 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40576 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
40577 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40578 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40579 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
40580 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40581 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
40582 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
40583 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40584 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
40585 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
40586 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
40587 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40588 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40589 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
40590 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40591 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40592 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
40593 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40594 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40595 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
40596 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40597 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40598 ; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
40599 ; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
40600 ; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
40601 ; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
40602 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40603 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40604 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40605 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40606 ; GCN-NEXT: s_setpc_b64 s[30:31]
40608 ; GFX7-LABEL: v_fma_v4bf16:
40610 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40611 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
40612 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
40613 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
40614 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40615 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
40616 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
40617 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
40618 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
40619 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40620 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40621 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
40622 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
40623 ; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11
40624 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
40625 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
40626 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40627 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40628 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
40629 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
40630 ; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7
40631 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
40632 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40633 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40634 ; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6
40635 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
40636 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40637 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40638 ; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5
40639 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40640 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40641 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40642 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40643 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40645 ; GFX8-LABEL: v_fma_v4bf16:
40647 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40648 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40649 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40650 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40651 ; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
40652 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
40653 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
40654 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40655 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40656 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40657 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
40658 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
40659 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
40660 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
40661 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
40662 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
40663 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
40664 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
40665 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
40666 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
40667 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40668 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40669 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40670 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40671 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
40672 ; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
40673 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
40674 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
40675 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40676 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40677 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40678 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
40679 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
40680 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
40681 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40682 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
40683 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
40684 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
40685 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
40686 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
40687 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40688 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40689 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
40690 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40691 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
40692 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
40693 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40695 ; GFX9-LABEL: v_fma_v4bf16:
40697 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40698 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40699 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40700 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40701 ; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
40702 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40703 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40704 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40705 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
40706 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40707 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
40708 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
40709 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
40710 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
40711 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
40712 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
40713 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
40714 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
40715 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
40716 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
40717 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
40718 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
40719 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
40720 ; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
40721 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40722 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40723 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40724 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
40725 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
40726 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
40727 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
40728 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
40729 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
40730 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
40731 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
40732 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
40733 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40734 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
40735 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
40736 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
40737 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
40738 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40740 ; GFX10-LABEL: v_fma_v4bf16:
40742 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40743 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40744 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40745 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40746 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40747 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40748 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40749 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
40750 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
40751 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
40752 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
40753 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40754 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40755 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40756 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
40757 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
40758 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
40759 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
40760 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
40761 ; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
40762 ; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
40763 ; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
40764 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40765 ; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
40766 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
40767 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40768 ; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
40769 ; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
40770 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
40771 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
40772 ; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
40773 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
40774 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40775 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40776 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40777 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40778 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40779 ; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
40780 ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
40781 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40783 ; GFX11-LABEL: v_fma_v4bf16:
40785 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40786 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1
40787 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40788 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
40789 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40790 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3
40791 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40792 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
40793 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40794 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
40795 ; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3
40796 ; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
40797 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40798 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
40799 ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
40800 ; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6
40801 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
40802 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
40803 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40804 ; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2
40805 ; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
40806 ; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
40807 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
40808 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
40809 ; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8
40810 ; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
40811 ; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
40812 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
40813 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
40814 ; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
40815 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
40816 ; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
40817 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
40818 ; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
40819 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
40820 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
40821 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
40822 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
40823 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
40824 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
40825 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
40826 ; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
40827 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
40828 ; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
40829 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40830 %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
40831 ret <4 x bfloat> %op
; Declarations of the llvm.fmuladd intrinsic overloads for scalar bfloat and
; for 2-, 3- and 4-element bfloat vectors.
40834 declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
40835 declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
40836 declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
40837 declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; Codegen test for llvm.fmuladd on scalar bfloat. The check lines inside the
; function are autogenerated (see the NOTE at the top of the file); there is
; one section per FileCheck prefix taken from the RUN lines.
; In every section the expected code is a separate v_mul_f32 and v_add_f32
; (no fused multiply-add instruction appears), and the product is masked with
; 0xffff0000 before the add.
; GFX8 and later additionally expect a v_bfe_u32 / add 0x7fff / v_or_b32
; 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence after both the multiply and
; the add (presumably the f32->bf16 rounding with NaN handling -- confirm
; against the lowering).
40839 define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
40840 ; GCN-LABEL: v_fmuladd_bf16:
40842 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40843 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40844 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40845 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40846 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40847 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40848 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
40849 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40850 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
40851 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1
40852 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40853 ; GCN-NEXT: s_setpc_b64 s[30:31]
40855 ; GFX7-LABEL: v_fmuladd_bf16:
40857 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40858 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40859 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
40860 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40861 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40862 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40863 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
40864 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40865 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
40866 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
40867 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40868 ; GFX7-NEXT: s_setpc_b64 s[30:31]
40870 ; GFX8-LABEL: v_fmuladd_bf16:
40872 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40873 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40874 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40875 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
40876 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40877 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40878 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40879 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
40880 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40881 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
40882 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40883 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40884 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
40885 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
40886 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
40887 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
40888 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
40889 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40890 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40891 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40892 ; GFX8-NEXT: s_setpc_b64 s[30:31]
40894 ; GFX9-LABEL: v_fmuladd_bf16:
40896 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40897 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40898 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40899 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
40900 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40901 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
40902 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40903 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
40904 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40905 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
40906 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40907 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40908 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
40909 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
40910 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
40911 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
40912 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
40913 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
40914 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40915 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40917 ; GFX10-LABEL: v_fmuladd_bf16:
40919 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40920 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40921 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40922 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
40923 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
40924 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
40925 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40926 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40927 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
40928 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
40929 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40930 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
40931 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
40932 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
40933 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40934 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40935 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40936 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40937 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40939 ; GFX11-LABEL: v_fmuladd_bf16:
40941 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40942 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
40943 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
40944 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40945 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
40946 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
40947 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
40948 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40949 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
40950 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40951 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
40952 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40953 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40954 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
40955 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
40956 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
40957 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
40958 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
40959 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
40960 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40961 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
40962 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
40963 ; GFX11-NEXT: s_setpc_b64 s[30:31]
40964 %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
; v_fmuladd_v2bf16: calls @llvm.fmuladd.v2bf16 on the three <2 x bfloat>
; arguments (%a, %b, %c) and returns the call result directly.
; The prefixed comment lines below are the expected llc output for each
; target configuration exercised by this file. Per the file header they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them with
; that script instead of editing them by hand.
; In every expected sequence shown here the multiply and the add appear as
; separate v_mul_f32 / v_add_f32 (or v_dual_*) instructions; no fused
; multiply-add instruction is expected for any of the listed targets.
40968 define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
40969 ; GCN-LABEL: v_fmuladd_v2bf16:
40971 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40972 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
40973 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
40974 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
40975 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
40976 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
40977 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
40978 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
40979 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40980 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
40981 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
40982 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40983 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
40984 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
40985 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
40986 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40987 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40988 ; GCN-NEXT: v_add_f32_e32 v1, v1, v5
40989 ; GCN-NEXT: v_add_f32_e32 v0, v0, v4
40990 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
40991 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
40992 ; GCN-NEXT: s_setpc_b64 s[30:31]
40994 ; GFX7-LABEL: v_fmuladd_v2bf16:
40996 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40997 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
40998 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
40999 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41000 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41001 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41002 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41003 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41004 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41005 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41006 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41007 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
41008 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
41009 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41010 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41011 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41012 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41013 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
41014 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
41015 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41016 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41017 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41019 ; GFX8-LABEL: v_fmuladd_v2bf16:
41021 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41022 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41023 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41024 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
41025 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
41026 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
41027 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
41028 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
41029 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41030 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41031 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41032 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
41033 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
41034 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
41035 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41036 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
41037 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41038 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41039 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
41040 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
41041 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
41042 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41043 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
41044 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41045 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
41046 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
41047 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41048 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41049 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
41050 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41051 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
41052 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
41053 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
41054 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
41055 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
41056 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
41057 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41058 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
41059 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41060 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41061 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41063 ; GFX9-LABEL: v_fmuladd_v2bf16:
41065 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41066 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41067 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41068 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
41069 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
41070 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41071 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
41072 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
41073 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41074 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41075 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41076 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
41077 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
41078 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41079 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41080 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
41081 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
41082 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
41083 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
41084 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41085 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
41086 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
41087 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
41088 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41089 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41090 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
41091 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41092 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
41093 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
41094 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
41095 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
41096 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
41097 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41098 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
41099 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41100 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41101 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41103 ; GFX10-LABEL: v_fmuladd_v2bf16:
41105 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41106 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41107 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41108 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41109 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41110 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
41111 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
41112 ; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
41113 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
41114 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
41115 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41116 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
41117 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
41118 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41119 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
41120 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41121 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41122 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41123 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41124 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
41125 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
41126 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41127 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
41128 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
41129 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41130 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41131 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
41132 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41133 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
41134 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
41135 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41136 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41137 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41138 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
41139 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41141 ; GFX11-LABEL: v_fmuladd_v2bf16:
41143 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41144 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
41145 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
41146 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41147 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
41148 ; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
41149 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
41150 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41151 ; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
41152 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
41153 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41154 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
41155 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41156 ; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
41157 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
41158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
41159 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
41160 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41161 ; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
41162 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41163 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41164 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41165 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
41166 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
41167 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41168 ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
41169 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41171 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
41172 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
41173 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
41174 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41175 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41176 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
41177 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
41178 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41179 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
41180 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41181 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
41182 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
41183 ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
41184 ; GFX11-NEXT: s_setpc_b64 s[30:31]
41185 %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
41186 ret <2 x bfloat> %op
; v_fmuladd_v3bf16: calls @llvm.fmuladd.v3bf16 on the three <3 x bfloat>
; arguments (%a, %b, %c) and returns the call result directly.
; The prefixed comment lines below are the expected llc output for each
; target configuration exercised by this file. Per the file header they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them with
; that script instead of editing them by hand.
; In every expected sequence shown here the multiply and the add appear as
; separate v_mul_f32 / v_add_f32 (or v_dual_*) instructions; no fused
; multiply-add instruction is expected for any of the listed targets.
; Unlike the <2 x bfloat> variant, the two gfx1100 configurations (with and
; without real-true16) are checked under separate prefixes. In the lines
; visible here they differ only in the second source operand of the final
; v_alignbit_b32 (v0 versus s0). Presumably that operand supplies the unused
; upper half of the odd third element and is a don't-care value - confirm
; against the backend before relying on it.
41189 define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
41190 ; GCN-LABEL: v_fmuladd_v3bf16:
41192 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41193 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
41194 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
41195 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
41196 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
41197 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
41198 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
41199 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
41200 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
41201 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
41202 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41203 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41204 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
41205 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41206 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41207 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41208 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41209 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41210 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41211 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
41212 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
41213 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
41214 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41215 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41216 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41217 ; GCN-NEXT: v_add_f32_e32 v2, v2, v8
41218 ; GCN-NEXT: v_add_f32_e32 v1, v1, v7
41219 ; GCN-NEXT: v_add_f32_e32 v0, v0, v6
41220 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41221 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41222 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41223 ; GCN-NEXT: s_setpc_b64 s[30:31]
41225 ; GFX7-LABEL: v_fmuladd_v3bf16:
41227 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41228 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
41229 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41230 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41231 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41232 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
41233 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41234 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41235 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41236 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41237 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41238 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41239 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41240 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
41241 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
41242 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
41243 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
41244 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
41245 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
41246 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41247 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
41248 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41249 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
41250 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41251 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
41252 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
41253 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
41254 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
41255 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41256 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41257 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41258 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41260 ; GFX8-LABEL: v_fmuladd_v3bf16:
41262 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41263 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41264 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41265 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
41266 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41267 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41268 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
41269 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
41270 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41271 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
41272 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41273 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41274 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
41275 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41276 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41277 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41278 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41279 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
41280 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41281 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41282 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41283 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41284 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
41285 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41286 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41287 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41288 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
41289 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41290 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41291 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41292 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41293 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
41294 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41295 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41296 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41297 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41298 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41299 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
41300 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
41301 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41302 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41303 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41304 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41305 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
41306 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
41307 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41308 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41309 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41310 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41311 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
41312 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41313 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41314 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
41315 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41316 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41317 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41318 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41319 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
41320 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41321 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41323 ; GFX9-LABEL: v_fmuladd_v3bf16:
41325 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41326 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41327 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41328 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
41329 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41330 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41331 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41332 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
41333 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41334 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
41335 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41336 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41337 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
41338 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41339 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41340 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
41341 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41342 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41343 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41344 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41345 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
41346 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41347 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41348 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
41349 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41350 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41351 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41352 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41353 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
41354 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41355 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41356 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41357 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
41358 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41359 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
41360 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41361 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41362 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
41363 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41364 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
41365 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41366 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41367 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41368 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41369 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
41370 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41371 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41372 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41373 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41374 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41375 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41376 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41377 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
41378 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41380 ; GFX10-LABEL: v_fmuladd_v3bf16:
41382 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41383 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
41384 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41385 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41386 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41387 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41388 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41389 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
41390 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
41391 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
41392 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41393 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41394 ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
41395 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41396 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
41397 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41398 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
41399 ; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41400 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
41401 ; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41402 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41403 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41404 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41405 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41406 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41407 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41408 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41409 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41410 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
41411 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41412 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41413 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41414 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
41415 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41416 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
41417 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
41418 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
41419 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41420 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
41421 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41422 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
41423 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
41424 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41425 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41426 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41427 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41428 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41429 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41430 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41431 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41432 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
41433 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41435 ; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
41436 ; GFX11TRUE16: ; %bb.0:
41437 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41438 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41439 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41440 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41441 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41442 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41443 ; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41444 ; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
41445 ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
41446 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41447 ; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41448 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41449 ; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
41450 ; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v7, v6
41451 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41452 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
41453 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41454 ; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
41455 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41456 ; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
41457 ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41458 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41459 ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41460 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41461 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41462 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41463 ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41464 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41465 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41466 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41467 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41468 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41469 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41470 ; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
41471 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41472 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41473 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41474 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
41475 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41476 ; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41477 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
41478 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41479 ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
41480 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41481 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
41482 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41483 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
41484 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41485 ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41486 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41487 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41488 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41489 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41490 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41491 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41492 ; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41493 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41494 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
41495 ; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
41496 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
41498 ; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
41499 ; GFX11FAKE16: ; %bb.0:
41500 ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41501 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
41502 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
41503 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41504 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41505 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41506 ; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
41507 ; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
41508 ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
41509 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41510 ; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
41511 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
41512 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
41513 ; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6
41514 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41515 ; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
41516 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41517 ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
41518 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41519 ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
41520 ; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41521 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
41522 ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
41523 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41524 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41525 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
41526 ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41527 ; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41528 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
41529 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41530 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41531 ; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41532 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
41533 ; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5
41534 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41535 ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41536 ; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41537 ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
41538 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41539 ; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
41540 ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
41541 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41542 ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
41543 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
41544 ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
41545 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
41546 ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
41547 ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
41548 ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
41549 ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
41550 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
41551 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41552 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41553 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
41554 ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41555 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41556 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
41557 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
41558 ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
41559 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
41560 %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
41561 ret <3 x bfloat> %op
41564 define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
41565 ; GCN-LABEL: v_fmuladd_v4bf16:
41567 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41568 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
41569 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
41570 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
41571 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
41572 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
41573 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
41574 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
41575 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
41576 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
41577 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
41578 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
41579 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
41580 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41581 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41582 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
41583 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41584 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41585 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
41586 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41587 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41588 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
41589 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41590 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41591 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
41592 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
41593 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
41594 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
41595 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
41596 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41597 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41598 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41599 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41600 ; GCN-NEXT: v_add_f32_e32 v3, v3, v11
41601 ; GCN-NEXT: v_add_f32_e32 v2, v2, v10
41602 ; GCN-NEXT: v_add_f32_e32 v1, v1, v9
41603 ; GCN-NEXT: v_add_f32_e32 v0, v0, v8
41604 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41605 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41606 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41607 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41608 ; GCN-NEXT: s_setpc_b64 s[30:31]
41610 ; GFX7-LABEL: v_fmuladd_v4bf16:
41612 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41613 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
41614 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
41615 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
41616 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
41617 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
41618 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
41619 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
41620 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
41621 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
41622 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41623 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41624 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41625 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41626 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41627 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41628 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41629 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
41630 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
41631 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
41632 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
41633 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
41634 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
41635 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
41636 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
41637 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41638 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
41639 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41640 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
41641 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41642 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
41643 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41644 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
41645 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
41646 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
41647 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
41648 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
41649 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41650 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41651 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41652 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41653 ; GFX7-NEXT: s_setpc_b64 s[30:31]
41655 ; GFX8-LABEL: v_fmuladd_v4bf16:
41657 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41658 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41659 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41660 ; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
41661 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
41662 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
41663 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
41664 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
41665 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41666 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41667 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41668 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
41669 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
41670 ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
41671 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
41672 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
41673 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41674 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41675 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
41676 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
41677 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
41678 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41679 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41680 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41681 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41682 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41683 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
41684 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41685 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
41686 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41687 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41688 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
41689 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
41690 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
41691 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
41692 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
41693 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41694 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41695 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41696 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41697 ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
41698 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41699 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41700 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41701 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
41702 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41703 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41704 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41705 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41706 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
41707 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
41708 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
41709 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41710 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41711 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
41712 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
41713 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
41714 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41715 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41716 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41717 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41718 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
41719 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
41720 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41721 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41722 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41723 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41724 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
41725 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
41726 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
41727 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
41728 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
41729 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41730 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41731 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
41732 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
41733 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
41734 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
41735 ; GFX8-NEXT: s_setpc_b64 s[30:31]
41737 ; GFX9-LABEL: v_fmuladd_v4bf16:
41739 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41740 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41741 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41742 ; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
41743 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
41744 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
41745 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
41746 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
41747 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41748 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41749 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
41750 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
41751 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
41752 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41753 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41754 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
41755 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
41756 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
41757 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
41758 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
41759 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41760 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
41761 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41762 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
41763 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41764 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
41765 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41766 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
41767 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
41768 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
41769 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
41770 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
41771 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
41772 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
41773 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
41774 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
41775 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
41776 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41777 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41778 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
41779 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41780 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41781 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41782 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
41783 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
41784 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41785 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41786 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
41787 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
41788 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
41789 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
41790 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
41791 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41792 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
41793 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41794 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
41795 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41796 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
41797 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41798 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
41799 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
41800 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
41801 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
41802 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
41803 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
41804 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
41805 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
41806 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
41807 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
41808 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41810 ; GFX10-LABEL: v_fmuladd_v4bf16:
41812 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41813 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41814 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41815 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41816 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41817 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
41818 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41819 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
41820 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
41821 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41822 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
41823 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
41824 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
41825 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
41826 ; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
41827 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
41828 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
41829 ; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
41830 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
41831 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
41832 ; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
41833 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41834 ; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
41835 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
41836 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41837 ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
41838 ; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
41839 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
41840 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41841 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
41842 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
41843 ; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
41844 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
41845 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
41846 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41847 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
41848 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41849 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
41850 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41851 ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
41852 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
41853 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41854 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
41855 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
41856 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41857 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
41858 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41859 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
41860 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
41861 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
41862 ; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
41863 ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
41864 ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
41865 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41866 ; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
41867 ; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
41868 ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
41869 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41870 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
41871 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
41872 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41873 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41874 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41875 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41876 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41877 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41878 ; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
41879 ; GFX10-NEXT: s_setpc_b64 s[30:31]
41881 ; GFX11-LABEL: v_fmuladd_v4bf16:
41883 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41884 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0
41885 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41886 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1
41887 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
41888 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
41889 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
41890 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41891 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41892 ; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
41893 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
41894 ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
41895 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41896 ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
41897 ; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7
41898 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6
41899 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
41900 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41901 ; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
41902 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
41903 ; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
41904 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
41905 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
41906 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41907 ; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7
41908 ; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
41909 ; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1
41910 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
41911 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
41912 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
41913 ; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
41914 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
41915 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
41916 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
41917 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
41918 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
41919 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41920 ; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
41921 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
41922 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
41923 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
41924 ; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
41925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41926 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
41927 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
41928 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
41929 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
41930 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
41931 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v8
41932 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
41933 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
41934 ; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
41935 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
41936 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
41937 ; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
41938 ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
41939 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41940 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
41941 ; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
41942 ; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
41943 ; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
41944 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
41945 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
41946 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
41947 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
41948 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
41949 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
41950 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
41951 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
41952 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
41953 ; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
41954 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
41955 ; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
41956 ; GFX11-NEXT: s_setpc_b64 s[30:31]
41957 %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
41958 ret <4 x bfloat> %op