1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
4 ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; Intrinsic declarations used by the tests in this file: fabs and canonicalize
; on half and <2 x half>, canonicalize on wider half vectors, and the AMDGPU
; workitem id query.
; NOTE(review): several of the wide-vector canonicalize declarations are not
; referenced by the functions visible in this part of the file; presumably
; later tests use them. Attribute group #0 is defined elsewhere in the file.
7 declare half @llvm.fabs.f16(half) #0
8 declare half @llvm.canonicalize.f16(half) #0
9 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
10 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
11 declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
12 declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
13 declare <6 x half> @llvm.canonicalize.v6f16(<6 x half>) #0
14 declare <8 x half> @llvm.canonicalize.v8f16(<8 x half>) #0
15 declare <12 x half> @llvm.canonicalize.v12f16(<12 x half>) #0
16 declare <16 x half> @llvm.canonicalize.v16f16(<16 x half>) #0
17 declare <32 x half> @llvm.canonicalize.v32f16(<32 x half>) #0
18 declare <64 x half> @llvm.canonicalize.v64f16(<64 x half>) #0
19 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Canonicalizing an undef half. The expected code for every target only
; materializes the constant 0 in a VGPR and stores it as a 16-bit value to
; %out: the intrinsic call is folded away and no max/mul/convert instruction
; appears in the checks.
21 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
22 ; VI-LABEL: test_fold_canonicalize_undef_value_f16:
24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
25 ; VI-NEXT: v_mov_b32_e32 v2, 0
26 ; VI-NEXT: s_waitcnt lgkmcnt(0)
27 ; VI-NEXT: v_mov_b32_e32 v0, s0
28 ; VI-NEXT: v_mov_b32_e32 v1, s1
29 ; VI-NEXT: flat_store_short v[0:1], v2
32 ; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
34 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
35 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
36 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX9-NEXT: global_store_short v0, v0, s[0:1]
40 ; CI-LABEL: test_fold_canonicalize_undef_value_f16:
42 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
43 ; CI-NEXT: s_mov_b32 s3, 0xf000
44 ; CI-NEXT: s_mov_b32 s2, -1
45 ; CI-NEXT: v_mov_b32_e32 v0, 0
46 ; CI-NEXT: s_waitcnt lgkmcnt(0)
47 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
50 ; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
52 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
53 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
54 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
55 ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
57 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
58 ; GFX11-NEXT: s_endpgm
59 %canonicalized = call half @llvm.canonicalize.f16(half undef)
60 store half %canonicalized, ptr addrspace(1) %out
; Canonicalize a half loaded from %out (a value living in a VGPR).
; The tonga, gfx900 and gfx1100 checks expect a single v_max_f16 of the value
; with itself. The kaveri checks expect a round trip through f32 instead:
; convert to f32, multiply by 1.0, convert back to f16.
; The result is stored through an undef pointer, so the address operand that
; appears on the store line of the checks is not a meaningful address.
64 define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
65 ; VI-LABEL: v_test_canonicalize_var_f16:
67 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
68 ; VI-NEXT: s_waitcnt lgkmcnt(0)
69 ; VI-NEXT: v_mov_b32_e32 v0, s0
70 ; VI-NEXT: v_mov_b32_e32 v1, s1
71 ; VI-NEXT: flat_load_ushort v0, v[0:1]
72 ; VI-NEXT: s_waitcnt vmcnt(0)
73 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
74 ; VI-NEXT: flat_store_short v[0:1], v0
77 ; GFX9-LABEL: v_test_canonicalize_var_f16:
79 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
80 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
81 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
83 ; GFX9-NEXT: s_waitcnt vmcnt(0)
84 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
85 ; GFX9-NEXT: global_store_short v[0:1], v0, off
88 ; CI-LABEL: v_test_canonicalize_var_f16:
90 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
91 ; CI-NEXT: s_mov_b32 s3, 0xf000
92 ; CI-NEXT: s_mov_b32 s2, -1
93 ; CI-NEXT: s_waitcnt lgkmcnt(0)
94 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
95 ; CI-NEXT: s_waitcnt vmcnt(0)
96 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
97 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
98 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
99 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
102 ; GFX11-LABEL: v_test_canonicalize_var_f16:
104 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
105 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
106 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
108 ; GFX11-NEXT: s_waitcnt vmcnt(0)
109 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
110 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
111 ; GFX11-NEXT: s_nop 0
112 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
113 ; GFX11-NEXT: s_endpgm
114 %val = load half, ptr addrspace(1) %out
115 %canonicalized = call half @llvm.canonicalize.f16(half %val)
116 store half %canonicalized, ptr addrspace(1) undef
; Canonicalize a half that arrives as a kernel argument (an i16 bitcast to
; half), i.e. a value loaded into an SGPR. The tonga, gfx900 and gfx1100
; checks expect the scalar register to be used directly as both operands of
; v_max_f16_e64; the kaveri checks expect the scalar register to feed
; v_cvt_f32_f16 followed by a multiply by 1.0 and a convert back to f16.
120 define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
121 ; VI-LABEL: s_test_canonicalize_var_f16:
123 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
124 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
126 ; VI-NEXT: v_max_f16_e64 v2, s2, s2
127 ; VI-NEXT: v_mov_b32_e32 v0, s0
128 ; VI-NEXT: v_mov_b32_e32 v1, s1
129 ; VI-NEXT: flat_store_short v[0:1], v2
132 ; GFX9-LABEL: s_test_canonicalize_var_f16:
134 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
135 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
136 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
137 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
138 ; GFX9-NEXT: v_max_f16_e64 v1, s4, s4
139 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
140 ; GFX9-NEXT: s_endpgm
142 ; CI-LABEL: s_test_canonicalize_var_f16:
144 ; CI-NEXT: s_load_dword s2, s[0:1], 0xb
145 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
146 ; CI-NEXT: s_mov_b32 s3, 0xf000
147 ; CI-NEXT: s_waitcnt lgkmcnt(0)
148 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
149 ; CI-NEXT: s_mov_b32 s2, -1
150 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
151 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
152 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
155 ; GFX11-LABEL: s_test_canonicalize_var_f16:
157 ; GFX11-NEXT: s_clause 0x1
158 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
159 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
160 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
161 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-NEXT: v_max_f16_e64 v1, s2, s2
163 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
164 ; GFX11-NEXT: s_nop 0
165 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
166 ; GFX11-NEXT: s_endpgm
167 %val = bitcast i16 %val.arg to half
168 %canonicalized = call half @llvm.canonicalize.f16(half %val)
169 store half %canonicalized, ptr addrspace(1) %out
; Canonicalize a <2 x half> assembled from two scalar half arguments
; (non-kernel function, result returned in registers).
; Expected code per target:
;  - tonga: each element is canonicalized separately with v_max_f16 (the high
;    element via the SDWA form writing WORD_1) and the halves are ORed.
;  - gfx900 / gfx1100: the elements are packed with v_perm_b32 and then
;    canonicalized together by one v_pk_max_f16.
;  - kaveri: only a f32->f16->f32 conversion pair per element appears; there
;    is no multiply by 1.0 in these checks.
173 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
174 ; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
176 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
178 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
179 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
180 ; VI-NEXT: s_setpc_b64 s[30:31]
182 ; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
184 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
186 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
187 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
190 ; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
192 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
194 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
195 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
196 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
197 ; CI-NEXT: s_setpc_b64 s[30:31]
199 ; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
201 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
203 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
204 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
205 ; GFX11-NEXT: s_setpc_b64 s[30:31]
206 %ins0 = insertelement <2 x half> undef, half %lo, i32 0
207 %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
208 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
209 ret <2 x half> %canonicalized
; Canonicalize fabs(x) for a half loaded from %out. The checks expect the
; fabs to be folded into an absolute-value source modifier (|v|) on the
; canonicalizing instruction: v_max_f16_e64 on tonga, gfx900 and gfx1100, and
; the initial v_cvt_f32_f16_e64 on kaveri. No separate fabs instruction is
; expected.
212 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
213 ; VI-LABEL: v_test_canonicalize_fabs_var_f16:
215 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
216 ; VI-NEXT: s_waitcnt lgkmcnt(0)
217 ; VI-NEXT: v_mov_b32_e32 v0, s0
218 ; VI-NEXT: v_mov_b32_e32 v1, s1
219 ; VI-NEXT: flat_load_ushort v2, v[0:1]
220 ; VI-NEXT: s_waitcnt vmcnt(0)
221 ; VI-NEXT: v_max_f16_e64 v2, |v2|, |v2|
222 ; VI-NEXT: flat_store_short v[0:1], v2
225 ; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
227 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
228 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
229 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
230 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
231 ; GFX9-NEXT: s_waitcnt vmcnt(0)
232 ; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1|
233 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
234 ; GFX9-NEXT: s_endpgm
236 ; CI-LABEL: v_test_canonicalize_fabs_var_f16:
238 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
239 ; CI-NEXT: s_mov_b32 s3, 0xf000
240 ; CI-NEXT: s_mov_b32 s2, -1
241 ; CI-NEXT: s_waitcnt lgkmcnt(0)
242 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
243 ; CI-NEXT: s_waitcnt vmcnt(0)
244 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
245 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
246 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
247 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
250 ; GFX11-LABEL: v_test_canonicalize_fabs_var_f16:
252 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
253 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
254 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
256 ; GFX11-NEXT: s_waitcnt vmcnt(0)
257 ; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1|
258 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
259 ; GFX11-NEXT: s_nop 0
260 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
261 ; GFX11-NEXT: s_endpgm
262 %val = load half, ptr addrspace(1) %out
263 %val.fabs = call half @llvm.fabs.f16(half %val)
264 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
265 store half %canonicalized, ptr addrspace(1) %out
; Canonicalize -fabs(x) for a half loaded from %out. The checks expect both
; the fneg and the fabs to be folded into a combined -|v| source modifier on
; the canonicalizing instruction (v_max_f16_e64, or v_cvt_f32_f16_e64 on
; kaveri); no separate sign-manipulation instruction is expected.
269 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
270 ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
272 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
273 ; VI-NEXT: s_waitcnt lgkmcnt(0)
274 ; VI-NEXT: v_mov_b32_e32 v0, s0
275 ; VI-NEXT: v_mov_b32_e32 v1, s1
276 ; VI-NEXT: flat_load_ushort v2, v[0:1]
277 ; VI-NEXT: s_waitcnt vmcnt(0)
278 ; VI-NEXT: v_max_f16_e64 v2, -|v2|, -|v2|
279 ; VI-NEXT: flat_store_short v[0:1], v2
282 ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
284 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
285 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
286 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
289 ; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
290 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
291 ; GFX9-NEXT: s_endpgm
293 ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
295 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
296 ; CI-NEXT: s_mov_b32 s3, 0xf000
297 ; CI-NEXT: s_mov_b32 s2, -1
298 ; CI-NEXT: s_waitcnt lgkmcnt(0)
299 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
300 ; CI-NEXT: s_waitcnt vmcnt(0)
301 ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
302 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
303 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
304 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
307 ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
309 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
310 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
311 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
312 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
313 ; GFX11-NEXT: s_waitcnt vmcnt(0)
314 ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
315 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
316 ; GFX11-NEXT: s_nop 0
317 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
318 ; GFX11-NEXT: s_endpgm
319 %val = load half, ptr addrspace(1) %out
320 %val.fabs = call half @llvm.fabs.f16(half %val)
321 %val.fabs.fneg = fneg half %val.fabs
322 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
323 store half %canonicalized, ptr addrspace(1) %out
; Canonicalize -x for a half loaded from %out. The checks expect the fneg to
; be folded into a negate source modifier (-v) on the canonicalizing
; instruction: both operands of v_max_f16_e64 on tonga, gfx900 and gfx1100,
; and the operand of v_cvt_f32_f16_e64 on kaveri.
327 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
328 ; VI-LABEL: v_test_canonicalize_fneg_var_f16:
330 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
331 ; VI-NEXT: s_waitcnt lgkmcnt(0)
332 ; VI-NEXT: v_mov_b32_e32 v0, s0
333 ; VI-NEXT: v_mov_b32_e32 v1, s1
334 ; VI-NEXT: flat_load_ushort v2, v[0:1]
335 ; VI-NEXT: s_waitcnt vmcnt(0)
336 ; VI-NEXT: v_max_f16_e64 v2, -v2, -v2
337 ; VI-NEXT: flat_store_short v[0:1], v2
340 ; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
342 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
343 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
344 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
345 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
347 ; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
348 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
349 ; GFX9-NEXT: s_endpgm
351 ; CI-LABEL: v_test_canonicalize_fneg_var_f16:
353 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
354 ; CI-NEXT: s_mov_b32 s3, 0xf000
355 ; CI-NEXT: s_mov_b32 s2, -1
356 ; CI-NEXT: s_waitcnt lgkmcnt(0)
357 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
358 ; CI-NEXT: s_waitcnt vmcnt(0)
359 ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
360 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
361 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
362 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
365 ; GFX11-LABEL: v_test_canonicalize_fneg_var_f16:
367 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
368 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
369 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
371 ; GFX11-NEXT: s_waitcnt vmcnt(0)
372 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
373 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
374 ; GFX11-NEXT: s_nop 0
375 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
376 ; GFX11-NEXT: s_endpgm
377 %val = load half, ptr addrspace(1) %out
378 %val.fneg = fneg half %val
379 %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
380 store half %canonicalized, ptr addrspace(1) %out
; Same IR as the fneg test above, but compiled under attribute group #2.
; NOTE(review): #2 is defined outside this view; judging by the function name
; it presumably disables f16 denormal support - confirm against the
; attribute definitions at the end of the file.
; The visible difference is on tonga, where the checks expect a multiply by
; -1.0 (v_mul_f16_e32) instead of v_max_f16 with negated operands. The gfx900
; and gfx1100 checks still expect v_max_f16_e64 with -v operands, and the
; kaveri checks still expect the cvt / mul-by-1.0 / cvt sequence.
384 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
385 ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
387 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
388 ; VI-NEXT: s_waitcnt lgkmcnt(0)
389 ; VI-NEXT: v_mov_b32_e32 v0, s0
390 ; VI-NEXT: v_mov_b32_e32 v1, s1
391 ; VI-NEXT: flat_load_ushort v2, v[0:1]
392 ; VI-NEXT: s_waitcnt vmcnt(0)
393 ; VI-NEXT: v_mul_f16_e32 v2, -1.0, v2
394 ; VI-NEXT: flat_store_short v[0:1], v2
397 ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
399 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
401 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
403 ; GFX9-NEXT: s_waitcnt vmcnt(0)
404 ; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
405 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
406 ; GFX9-NEXT: s_endpgm
408 ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
410 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
411 ; CI-NEXT: s_mov_b32 s3, 0xf000
412 ; CI-NEXT: s_mov_b32 s2, -1
413 ; CI-NEXT: s_waitcnt lgkmcnt(0)
414 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
415 ; CI-NEXT: s_waitcnt vmcnt(0)
416 ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
417 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
418 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
419 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
422 ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
424 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
425 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
426 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
427 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
428 ; GFX11-NEXT: s_waitcnt vmcnt(0)
429 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
430 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
431 ; GFX11-NEXT: s_nop 0
432 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
433 ; GFX11-NEXT: s_endpgm
434 %val = load half, ptr addrspace(1) %out
435 %val.fneg = fneg half %val
436 %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
437 store half %canonicalized, ptr addrspace(1) %out
; Same IR as the fneg(fabs(x)) test above, but compiled under attribute
; group #2.
; NOTE(review): #2 is defined outside this view; judging by the function name
; it presumably disables f16 denormal support - confirm against the
; attribute definitions at the end of the file.
; On tonga the checks expect a multiply of |v| by -1.0 (v_mul_f16_e64) rather
; than v_max_f16 with -|v| operands. The gfx900 and gfx1100 checks still
; expect v_max_f16_e64 with -|v| operands, and the kaveri checks still expect
; the cvt / mul-by-1.0 / cvt sequence with -|v| on the first convert.
441 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
442 ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
444 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
445 ; VI-NEXT: s_waitcnt lgkmcnt(0)
446 ; VI-NEXT: v_mov_b32_e32 v0, s0
447 ; VI-NEXT: v_mov_b32_e32 v1, s1
448 ; VI-NEXT: flat_load_ushort v2, v[0:1]
449 ; VI-NEXT: s_waitcnt vmcnt(0)
450 ; VI-NEXT: v_mul_f16_e64 v2, -1.0, |v2|
451 ; VI-NEXT: flat_store_short v[0:1], v2
454 ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
456 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
457 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
458 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
460 ; GFX9-NEXT: s_waitcnt vmcnt(0)
461 ; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
462 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
463 ; GFX9-NEXT: s_endpgm
465 ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
467 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
468 ; CI-NEXT: s_mov_b32 s3, 0xf000
469 ; CI-NEXT: s_mov_b32 s2, -1
470 ; CI-NEXT: s_waitcnt lgkmcnt(0)
471 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
472 ; CI-NEXT: s_waitcnt vmcnt(0)
473 ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
474 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
475 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
476 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
479 ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
481 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
482 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
483 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
485 ; GFX11-NEXT: s_waitcnt vmcnt(0)
486 ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
487 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
488 ; GFX11-NEXT: s_nop 0
489 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
490 ; GFX11-NEXT: s_endpgm
491 %val = load half, ptr addrspace(1) %out
492 %val.fabs = call half @llvm.fabs.f16(half %val)
493 %val.fabs.fneg = fneg half %val.fabs
494 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
495 store half %canonicalized, ptr addrspace(1) %out
; Constant folding: canonicalize(+0.0). Every target is expected to store the
; constant 0 directly, with no canonicalizing arithmetic instruction emitted.
499 define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
500 ; VI-LABEL: test_fold_canonicalize_p0_f16:
502 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
503 ; VI-NEXT: v_mov_b32_e32 v2, 0
504 ; VI-NEXT: s_waitcnt lgkmcnt(0)
505 ; VI-NEXT: v_mov_b32_e32 v0, s0
506 ; VI-NEXT: v_mov_b32_e32 v1, s1
507 ; VI-NEXT: flat_store_short v[0:1], v2
510 ; GFX9-LABEL: test_fold_canonicalize_p0_f16:
512 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
513 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
514 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX9-NEXT: global_store_short v0, v0, s[0:1]
516 ; GFX9-NEXT: s_endpgm
518 ; CI-LABEL: test_fold_canonicalize_p0_f16:
520 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
521 ; CI-NEXT: s_mov_b32 s3, 0xf000
522 ; CI-NEXT: s_mov_b32 s2, -1
523 ; CI-NEXT: v_mov_b32_e32 v0, 0
524 ; CI-NEXT: s_waitcnt lgkmcnt(0)
525 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
528 ; GFX11-LABEL: test_fold_canonicalize_p0_f16:
530 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
531 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
533 ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
534 ; GFX11-NEXT: s_nop 0
535 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
536 ; GFX11-NEXT: s_endpgm
537 %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
538 store half %canonicalized, ptr addrspace(1) %out
; Constant folding: canonicalize(-0.0). The expected stored bit pattern is
; 0x8000 (sign bit only). The tonga, gfx900 and gfx1100 checks print the
; immediate sign-extended to 32 bits (0xffff8000) while the kaveri checks
; print 0x8000; the 16-bit store writes the same low half in both cases.
542 define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
543 ; VI-LABEL: test_fold_canonicalize_n0_f16:
545 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
546 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000
547 ; VI-NEXT: s_waitcnt lgkmcnt(0)
548 ; VI-NEXT: v_mov_b32_e32 v0, s0
549 ; VI-NEXT: v_mov_b32_e32 v1, s1
550 ; VI-NEXT: flat_store_short v[0:1], v2
553 ; GFX9-LABEL: test_fold_canonicalize_n0_f16:
555 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
556 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
557 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
558 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
559 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
560 ; GFX9-NEXT: s_endpgm
562 ; CI-LABEL: test_fold_canonicalize_n0_f16:
564 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
565 ; CI-NEXT: s_mov_b32 s3, 0xf000
566 ; CI-NEXT: s_mov_b32 s2, -1
567 ; CI-NEXT: v_mov_b32_e32 v0, 0x8000
568 ; CI-NEXT: s_waitcnt lgkmcnt(0)
569 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
572 ; GFX11-LABEL: test_fold_canonicalize_n0_f16:
574 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
575 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
576 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
578 ; GFX11-NEXT: s_nop 0
579 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
580 ; GFX11-NEXT: s_endpgm
581 %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
582 store half %canonicalized, ptr addrspace(1) %out
; Constant folding: canonicalize(1.0). Every target is expected to store the
; immediate 0x3c00 (the binary16 encoding of 1.0) with no canonicalizing
; arithmetic instruction emitted.
586 define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
587 ; VI-LABEL: test_fold_canonicalize_p1_f16:
589 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
590 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
591 ; VI-NEXT: s_waitcnt lgkmcnt(0)
592 ; VI-NEXT: v_mov_b32_e32 v0, s0
593 ; VI-NEXT: v_mov_b32_e32 v1, s1
594 ; VI-NEXT: flat_store_short v[0:1], v2
597 ; GFX9-LABEL: test_fold_canonicalize_p1_f16:
599 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
600 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
601 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
602 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
603 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
604 ; GFX9-NEXT: s_endpgm
606 ; CI-LABEL: test_fold_canonicalize_p1_f16:
608 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
609 ; CI-NEXT: s_mov_b32 s3, 0xf000
610 ; CI-NEXT: s_mov_b32 s2, -1
611 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c00
612 ; CI-NEXT: s_waitcnt lgkmcnt(0)
613 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
616 ; GFX11-LABEL: test_fold_canonicalize_p1_f16:
618 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
619 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
620 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
622 ; GFX11-NEXT: s_nop 0
623 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
624 ; GFX11-NEXT: s_endpgm
625 %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
626 store half %canonicalized, ptr addrspace(1) %out
; Constant folding: canonicalize(-1.0). The expected stored bit pattern is
; 0xbc00 (the binary16 encoding of -1.0). The tonga, gfx900 and gfx1100
; checks print the immediate sign-extended to 32 bits (0xffffbc00) while the
; kaveri checks print 0xbc00; the 16-bit store writes the same low half.
630 define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
631 ; VI-LABEL: test_fold_canonicalize_n1_f16:
633 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
634 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00
635 ; VI-NEXT: s_waitcnt lgkmcnt(0)
636 ; VI-NEXT: v_mov_b32_e32 v0, s0
637 ; VI-NEXT: v_mov_b32_e32 v1, s1
638 ; VI-NEXT: flat_store_short v[0:1], v2
641 ; GFX9-LABEL: test_fold_canonicalize_n1_f16:
643 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
644 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
645 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00
646 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
648 ; GFX9-NEXT: s_endpgm
650 ; CI-LABEL: test_fold_canonicalize_n1_f16:
652 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
653 ; CI-NEXT: s_mov_b32 s3, 0xf000
654 ; CI-NEXT: s_mov_b32 s2, -1
655 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00
656 ; CI-NEXT: s_waitcnt lgkmcnt(0)
657 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
660 ; GFX11-LABEL: test_fold_canonicalize_n1_f16:
662 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
663 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
664 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
665 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
666 ; GFX11-NEXT: s_nop 0
667 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
668 ; GFX11-NEXT: s_endpgm
669 %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
670 store half %canonicalized, ptr addrspace(1) %out
; Constant folding: canonicalize(16.0). Every target is expected to store the
; immediate 0x4c00 (the binary16 encoding of 16.0) with no canonicalizing
; arithmetic instruction emitted.
674 define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
675 ; VI-LABEL: test_fold_canonicalize_literal_f16:
677 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
678 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c00
679 ; VI-NEXT: s_waitcnt lgkmcnt(0)
680 ; VI-NEXT: v_mov_b32_e32 v0, s0
681 ; VI-NEXT: v_mov_b32_e32 v1, s1
682 ; VI-NEXT: flat_store_short v[0:1], v2
685 ; GFX9-LABEL: test_fold_canonicalize_literal_f16:
687 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
688 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
689 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00
690 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
691 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
692 ; GFX9-NEXT: s_endpgm
694 ; CI-LABEL: test_fold_canonicalize_literal_f16:
696 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
697 ; CI-NEXT: s_mov_b32 s3, 0xf000
698 ; CI-NEXT: s_mov_b32 s2, -1
699 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c00
700 ; CI-NEXT: s_waitcnt lgkmcnt(0)
701 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
704 ; GFX11-LABEL: test_fold_canonicalize_literal_f16:
706 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
707 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
708 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
710 ; GFX11-NEXT: s_nop 0
711 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
712 ; GFX11-NEXT: s_endpgm
713 %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
714 store half %canonicalized, ptr addrspace(1) %out
; Constant folding of a positive subnormal half: 0xH03FF has a zero exponent
; field and an all-ones mantissa. Compiled under attribute group #1 (the same
; group used by the other default-mode tests in this file), every target is
; expected to store the bits 0x3ff unchanged, i.e. the subnormal is not
; flushed to zero.
718 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
719 ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
721 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
722 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
723 ; VI-NEXT: s_waitcnt lgkmcnt(0)
724 ; VI-NEXT: v_mov_b32_e32 v0, s0
725 ; VI-NEXT: v_mov_b32_e32 v1, s1
726 ; VI-NEXT: flat_store_short v[0:1], v2
729 ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
731 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
732 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
733 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
734 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
735 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
736 ; GFX9-NEXT: s_endpgm
738 ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
740 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
741 ; CI-NEXT: s_mov_b32 s3, 0xf000
742 ; CI-NEXT: s_mov_b32 s2, -1
743 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff
744 ; CI-NEXT: s_waitcnt lgkmcnt(0)
745 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
748 ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
750 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
751 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
752 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
754 ; GFX11-NEXT: s_nop 0
755 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
756 ; GFX11-NEXT: s_endpgm
757 %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
758 store half %canonicalized, ptr addrspace(1) %out
; Same positive-subnormal constant (0xH03FF) as the default-denormals test,
; but compiled under attribute group #3.
; NOTE(review): #3 is defined outside this view; judging by the function name
; it presumably enables denormal support explicitly - confirm against the
; attribute definitions at the end of the file.
; The expected output is identical to the default-mode test: the bits 0x3ff
; are stored unchanged on every target.
762 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
763 ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
765 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
766 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
767 ; VI-NEXT: s_waitcnt lgkmcnt(0)
768 ; VI-NEXT: v_mov_b32_e32 v0, s0
769 ; VI-NEXT: v_mov_b32_e32 v1, s1
770 ; VI-NEXT: flat_store_short v[0:1], v2
773 ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
775 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
776 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
777 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
778 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
779 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
780 ; GFX9-NEXT: s_endpgm
782 ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
784 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
785 ; CI-NEXT: s_mov_b32 s3, 0xf000
786 ; CI-NEXT: s_mov_b32 s2, -1
787 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff
788 ; CI-NEXT: s_waitcnt lgkmcnt(0)
789 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
792 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
794 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
795 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
796 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
797 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
798 ; GFX11-NEXT: s_nop 0
799 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
800 ; GFX11-NEXT: s_endpgm
801 %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
802 store half %canonicalized, ptr addrspace(1) %out
; Constant folding of a negative subnormal half: 0xH83FF is the sign bit plus
; a zero exponent field and an all-ones mantissa. Compiled under attribute
; group #1, the expected stored bit pattern is 0x83ff unchanged (not flushed).
; The tonga, gfx900 and gfx1100 checks print the immediate sign-extended to
; 32 bits (0xffff83ff) while the kaveri checks print 0x83ff; the 16-bit store
; writes the same low half.
806 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
807 ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
809 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
810 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
811 ; VI-NEXT: s_waitcnt lgkmcnt(0)
812 ; VI-NEXT: v_mov_b32_e32 v0, s0
813 ; VI-NEXT: v_mov_b32_e32 v1, s1
814 ; VI-NEXT: flat_store_short v[0:1], v2
817 ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
819 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
820 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
821 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
822 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
823 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
824 ; GFX9-NEXT: s_endpgm
826 ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
828 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
829 ; CI-NEXT: s_mov_b32 s3, 0xf000
830 ; CI-NEXT: s_mov_b32 s2, -1
831 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff
832 ; CI-NEXT: s_waitcnt lgkmcnt(0)
833 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
836 ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
838 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
839 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
840 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
841 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
842 ; GFX11-NEXT: s_nop 0
843 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
844 ; GFX11-NEXT: s_endpgm
845 %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
846 store half %canonicalized, ptr addrspace(1) %out
; Same negative-subnormal constant (0xH83FF) as the default-denormals test,
; but compiled under attribute group #3.
; NOTE(review): #3 is defined outside this view; judging by the function name
; it presumably enables denormal support explicitly - confirm against the
; attribute definitions at the end of the file.
; The expected output is identical to the default-mode test: the bit pattern
; 0x83ff is stored unchanged (printed as 0xffff83ff on tonga, gfx900 and
; gfx1100, and as 0x83ff on kaveri).
850 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
851 ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
853 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
854 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
855 ; VI-NEXT: s_waitcnt lgkmcnt(0)
856 ; VI-NEXT: v_mov_b32_e32 v0, s0
857 ; VI-NEXT: v_mov_b32_e32 v1, s1
858 ; VI-NEXT: flat_store_short v[0:1], v2
861 ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
863 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
864 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
865 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
866 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
867 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
868 ; GFX9-NEXT: s_endpgm
870 ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
872 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
873 ; CI-NEXT: s_mov_b32 s3, 0xf000
874 ; CI-NEXT: s_mov_b32 s2, -1
875 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff
876 ; CI-NEXT: s_waitcnt lgkmcnt(0)
877 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
880 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
882 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
883 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
884 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
886 ; GFX11-NEXT: s_nop 0
887 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
888 ; GFX11-NEXT: s_endpgm
889 %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
890 store half %canonicalized, ptr addrspace(1) %out
; Constant folding of the half constant 0xH7C00; every target is expected to
; store the same bits (0x7c00) unchanged.
; NOTE(review): despite the "qnan" in the function name, 0xH7C00 has an
; all-ones exponent field and a zero mantissa, which is the IEEE-754 binary16
; encoding of +infinity, not of a quiet NaN (a quiet NaN would be e.g.
; 0xH7E00). As written this test covers folding of +infinity. Confirm the
; intent before changing the constant, because the check lines are
; autogenerated and would have to be regenerated with it.
894 define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 {
895 ; VI-LABEL: test_fold_canonicalize_qnan_f16:
897 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
898 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c00
899 ; VI-NEXT: s_waitcnt lgkmcnt(0)
900 ; VI-NEXT: v_mov_b32_e32 v0, s0
901 ; VI-NEXT: v_mov_b32_e32 v1, s1
902 ; VI-NEXT: flat_store_short v[0:1], v2
905 ; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
907 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
908 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
909 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00
910 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
912 ; GFX9-NEXT: s_endpgm
914 ; CI-LABEL: test_fold_canonicalize_qnan_f16:
916 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
917 ; CI-NEXT: s_mov_b32 s3, 0xf000
918 ; CI-NEXT: s_mov_b32 s2, -1
919 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c00
920 ; CI-NEXT: s_waitcnt lgkmcnt(0)
921 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
924 ; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
926 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
927 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
928 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
929 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
930 ; GFX11-NEXT: s_nop 0
931 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
932 ; GFX11-NEXT: s_endpgm
933 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
934 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: the input is i16 -1 reinterpreted as half (bit pattern
; 0xFFFF: sign set, all-ones exponent, all-ones mantissa, so a NaN with the
; quiet bit set). All four targets are expected to store the constant 0x7e00,
; i.e. the sign and payload are dropped by the fold.
938 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 {
939 ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
941 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
942 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
943 ; VI-NEXT: s_waitcnt lgkmcnt(0)
944 ; VI-NEXT: v_mov_b32_e32 v0, s0
945 ; VI-NEXT: v_mov_b32_e32 v1, s1
946 ; VI-NEXT: flat_store_short v[0:1], v2
949 ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
951 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
952 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
953 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
954 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
955 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
956 ; GFX9-NEXT: s_endpgm
958 ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
960 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
961 ; CI-NEXT: s_mov_b32 s3, 0xf000
962 ; CI-NEXT: s_mov_b32 s2, -1
963 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
964 ; CI-NEXT: s_waitcnt lgkmcnt(0)
965 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
968 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
970 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
971 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
972 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
973 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
974 ; GFX11-NEXT: s_nop 0
975 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
976 ; GFX11-NEXT: s_endpgm
977 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
978 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: the input is i16 -2 reinterpreted as half (bit pattern
; 0xFFFE: sign set, all-ones exponent, quiet bit set, lowest mantissa bit
; clear). All four targets are expected to store the constant 0x7e00.
982 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 {
983 ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
985 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
986 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
987 ; VI-NEXT: s_waitcnt lgkmcnt(0)
988 ; VI-NEXT: v_mov_b32_e32 v0, s0
989 ; VI-NEXT: v_mov_b32_e32 v1, s1
990 ; VI-NEXT: flat_store_short v[0:1], v2
993 ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
995 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
996 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
997 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
998 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1000 ; GFX9-NEXT: s_endpgm
1002 ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1004 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1005 ; CI-NEXT: s_mov_b32 s3, 0xf000
1006 ; CI-NEXT: s_mov_b32 s2, -1
1007 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
1008 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1009 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
1012 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1014 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1015 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1016 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1017 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1018 ; GFX11-NEXT: s_nop 0
1019 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1020 ; GFX11-NEXT: s_endpgm
1021 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
1022 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: 0xH7C01 is a positive NaN whose quiet bit (mantissa
; bit 9) is clear and whose payload is 1, i.e. a signaling NaN. The checks
; expect the fold to produce the constant 0x7e00 on all four targets.
1026 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 {
1027 ; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
1029 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1030 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
1031 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1032 ; VI-NEXT: v_mov_b32_e32 v0, s0
1033 ; VI-NEXT: v_mov_b32_e32 v1, s1
1034 ; VI-NEXT: flat_store_short v[0:1], v2
1037 ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
1039 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1040 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1041 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
1042 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1043 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1044 ; GFX9-NEXT: s_endpgm
1046 ; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
1048 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1049 ; CI-NEXT: s_mov_b32 s3, 0xf000
1050 ; CI-NEXT: s_mov_b32 s2, -1
1051 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
1052 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1053 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
1056 ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
1058 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1059 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1060 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1061 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1062 ; GFX11-NEXT: s_nop 0
1063 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1064 ; GFX11-NEXT: s_endpgm
1065 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
1066 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: 0xH7DFF is a positive NaN with the quiet bit (mantissa
; bit 9) clear and the remaining nine mantissa bits all set, i.e. a signaling
; NaN with the largest payload. The checks expect the fold to produce the
; constant 0x7e00 on all four targets.
1070 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 {
1071 ; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
1073 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1074 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
1075 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1076 ; VI-NEXT: v_mov_b32_e32 v0, s0
1077 ; VI-NEXT: v_mov_b32_e32 v1, s1
1078 ; VI-NEXT: flat_store_short v[0:1], v2
1081 ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
1083 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1084 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1085 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
1086 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1087 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1088 ; GFX9-NEXT: s_endpgm
1090 ; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
1092 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1093 ; CI-NEXT: s_mov_b32 s3, 0xf000
1094 ; CI-NEXT: s_mov_b32 s2, -1
1095 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
1096 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1097 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
1100 ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
1102 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1103 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1104 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1105 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1106 ; GFX11-NEXT: s_nop 0
1107 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1108 ; GFX11-NEXT: s_endpgm
1109 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
1110 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: 0xHFDFF is the sign-bit-set counterpart of 0xH7DFF
; (negative signaling NaN, quiet bit clear, low nine mantissa bits set). The
; checks expect the fold to produce the positive constant 0x7e00 on all four
; targets, so the sign is not preserved.
1114 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 {
1115 ; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
1117 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1118 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
1119 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1120 ; VI-NEXT: v_mov_b32_e32 v0, s0
1121 ; VI-NEXT: v_mov_b32_e32 v1, s1
1122 ; VI-NEXT: flat_store_short v[0:1], v2
1125 ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
1127 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1128 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1129 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
1130 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1131 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1132 ; GFX9-NEXT: s_endpgm
1134 ; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
1136 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1137 ; CI-NEXT: s_mov_b32 s3, 0xf000
1138 ; CI-NEXT: s_mov_b32 s2, -1
1139 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
1140 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1141 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
1144 ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
1146 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1147 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1148 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1149 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1150 ; GFX11-NEXT: s_nop 0
1151 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1152 ; GFX11-NEXT: s_endpgm
1153 %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
1154 store half %canonicalized, ptr addrspace(1) %out
; Constant-folding test: 0xHFC01 is the sign-bit-set counterpart of 0xH7C01
; (negative signaling NaN, quiet bit clear, payload 1). The checks expect the
; fold to produce the positive constant 0x7e00 on all four targets.
1158 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 {
1159 ; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
1161 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1162 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
1163 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1164 ; VI-NEXT: v_mov_b32_e32 v0, s0
1165 ; VI-NEXT: v_mov_b32_e32 v1, s1
1166 ; VI-NEXT: flat_store_short v[0:1], v2
1169 ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
1171 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1172 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1173 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
1174 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1175 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1176 ; GFX9-NEXT: s_endpgm
1178 ; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
1180 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1181 ; CI-NEXT: s_mov_b32 s3, 0xf000
1182 ; CI-NEXT: s_mov_b32 s2, -1
1183 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
1184 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1185 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
1188 ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
1190 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1191 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1192 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1194 ; GFX11-NEXT: s_nop 0
1195 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1196 ; GFX11-NEXT: s_endpgm
1197 %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
1198 store half %canonicalized, ptr addrspace(1) %out
; Canonicalize a <2 x half> value that is only known at run time: the vector is
; loaded from %out indexed by the workitem id, canonicalized, and stored back
; to the base of %out (not to the indexed address).
; Expected lowering per target, as pinned by the checks below:
;  - tonga: one v_max_f16 per half (SDWA form for the high half), then an OR.
;  - gfx900 / gfx1100: a single packed v_pk_max_f16 of the value with itself.
;  - kaveri: each half is converted to f32, multiplied by 1.0, converted back
;    to f16, and the two halves are repacked with shift + OR.
1202 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 {
1203 ; VI-LABEL: v_test_canonicalize_var_v2f16:
1205 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1206 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1207 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1208 ; VI-NEXT: v_mov_b32_e32 v1, s1
1209 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1210 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1211 ; VI-NEXT: flat_load_dword v0, v[0:1]
1212 ; VI-NEXT: s_waitcnt vmcnt(0)
1213 ; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1214 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
1215 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
1216 ; VI-NEXT: v_mov_b32_e32 v0, s0
1217 ; VI-NEXT: v_mov_b32_e32 v1, s1
1218 ; VI-NEXT: flat_store_dword v[0:1], v2
1221 ; GFX9-LABEL: v_test_canonicalize_var_v2f16:
1223 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1224 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1225 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1227 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1228 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1229 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
1230 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
1231 ; GFX9-NEXT: s_endpgm
1233 ; CI-LABEL: v_test_canonicalize_var_v2f16:
1235 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1236 ; CI-NEXT: s_mov_b32 s3, 0xf000
1237 ; CI-NEXT: s_mov_b32 s6, 0
1238 ; CI-NEXT: s_mov_b32 s7, s3
1239 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1240 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1241 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
1242 ; CI-NEXT: v_mov_b32_e32 v1, 0
1243 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1244 ; CI-NEXT: s_mov_b32 s2, -1
1245 ; CI-NEXT: s_waitcnt vmcnt(0)
1246 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1247 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1248 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1249 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1250 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1251 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1252 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1253 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1254 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
1255 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1258 ; GFX11-LABEL: v_test_canonicalize_var_v2f16:
1260 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1261 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1262 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1263 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1264 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1265 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
1266 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1267 ; GFX11-NEXT: s_nop 0
1268 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1269 ; GFX11-NEXT: s_endpgm
1270 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1271 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1272 %val = load <2 x half>, ptr addrspace(1) %gep
1273 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
1274 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize fabs(x) for a run-time <2 x half> loaded from %out indexed by the
; workitem id; the result is stored to the base of %out.
; Expected handling of the fabs, as pinned by the checks below:
;  - tonga: folded into |v0| source modifiers on both v_max_f16 instructions.
;  - gfx900 / gfx1100: a separate v_and_b32 with 0x7fff7fff clears both sign
;    bits before the packed v_pk_max_f16.
;  - kaveri: folded into |..| source modifiers on the f16 -> f32 conversions
;    that precede the multiply by 1.0.
1278 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
1279 ; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
1281 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1282 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1283 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1284 ; VI-NEXT: v_mov_b32_e32 v1, s1
1285 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1286 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1287 ; VI-NEXT: flat_load_dword v0, v[0:1]
1288 ; VI-NEXT: s_waitcnt vmcnt(0)
1289 ; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1290 ; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0|
1291 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
1292 ; VI-NEXT: v_mov_b32_e32 v0, s0
1293 ; VI-NEXT: v_mov_b32_e32 v1, s1
1294 ; VI-NEXT: flat_store_dword v[0:1], v2
1297 ; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
1299 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1300 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1301 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1304 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1305 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1306 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
1307 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
1308 ; GFX9-NEXT: s_endpgm
1310 ; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
1312 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1313 ; CI-NEXT: s_mov_b32 s3, 0xf000
1314 ; CI-NEXT: s_mov_b32 s6, 0
1315 ; CI-NEXT: s_mov_b32 s7, s3
1316 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1317 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1318 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
1319 ; CI-NEXT: v_mov_b32_e32 v1, 0
1320 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1321 ; CI-NEXT: s_mov_b32 s2, -1
1322 ; CI-NEXT: s_waitcnt vmcnt(0)
1323 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1324 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
1325 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
1326 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1327 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1328 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1329 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1330 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1331 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
1332 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1335 ; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
1337 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1338 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1339 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1340 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1341 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1342 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1344 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
1345 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1346 ; GFX11-NEXT: s_nop 0
1347 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1348 ; GFX11-NEXT: s_endpgm
1349 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1350 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1351 %val = load <2 x half>, ptr addrspace(1) %gep
1352 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
1353 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
1354 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize -fabs(x) for a run-time <2 x half> loaded from %out indexed by
; the workitem id; the result is stored to the base of %out.
; Expected handling of the fneg(fabs), as pinned by the checks below:
;  - tonga: folded into -|v0| source modifiers on both v_max_f16 instructions.
;  - gfx900 / gfx1100: v_and_b32 with 0x7fff7fff performs the fabs, and the
;    negation is folded into neg_lo/neg_hi on the packed v_pk_max_f16.
;  - kaveri: a single v_or_b32 with 0x80008000 sets both sign bits before the
;    f16 -> f32 convert / multiply by 1.0 / convert back sequence.
1358 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
1359 ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1361 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1362 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1363 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1364 ; VI-NEXT: v_mov_b32_e32 v1, s1
1365 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1366 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1367 ; VI-NEXT: flat_load_dword v0, v[0:1]
1368 ; VI-NEXT: s_waitcnt vmcnt(0)
1369 ; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1370 ; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0|
1371 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
1372 ; VI-NEXT: v_mov_b32_e32 v0, s0
1373 ; VI-NEXT: v_mov_b32_e32 v1, s1
1374 ; VI-NEXT: flat_store_dword v[0:1], v2
1377 ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1379 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1380 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1381 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1382 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1383 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1384 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1385 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1386 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1387 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
1388 ; GFX9-NEXT: s_endpgm
1390 ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1392 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1393 ; CI-NEXT: s_mov_b32 s3, 0xf000
1394 ; CI-NEXT: s_mov_b32 s6, 0
1395 ; CI-NEXT: s_mov_b32 s7, s3
1396 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1397 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1398 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
1399 ; CI-NEXT: v_mov_b32_e32 v1, 0
1400 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1401 ; CI-NEXT: s_mov_b32 s2, -1
1402 ; CI-NEXT: s_waitcnt vmcnt(0)
1403 ; CI-NEXT: v_or_b32_e32 v0, 0x80008000, v0
1404 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1405 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1406 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1407 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1408 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1409 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1410 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1411 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1412 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
1413 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1416 ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1418 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1419 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1420 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1421 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1422 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1423 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1425 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1426 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1427 ; GFX11-NEXT: s_nop 0
1428 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1429 ; GFX11-NEXT: s_endpgm
1430 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1431 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1432 %val = load <2 x half>, ptr addrspace(1) %gep
1433 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
1434 %val.fabs.fneg = fneg <2 x half> %val.fabs
1435 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
1436 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize -x for a run-time <2 x half> loaded from %out indexed by the
; workitem id; the result is stored to the base of %out.
; Expected handling of the fneg, as pinned by the checks below:
;  - tonga: folded into -v0 source modifiers on both v_max_f16 instructions.
;  - gfx900 / gfx1100: folded into neg_lo/neg_hi on the packed v_pk_max_f16,
;    with no separate bit operation.
;  - kaveri: a single v_xor_b32 with 0x80008000 flips both sign bits before the
;    f16 -> f32 convert / multiply by 1.0 / convert back sequence.
1440 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 {
1441 ; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
1443 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1444 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1445 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1446 ; VI-NEXT: v_mov_b32_e32 v1, s1
1447 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1448 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1449 ; VI-NEXT: flat_load_dword v0, v[0:1]
1450 ; VI-NEXT: s_waitcnt vmcnt(0)
1451 ; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1452 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1453 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
1454 ; VI-NEXT: v_mov_b32_e32 v0, s0
1455 ; VI-NEXT: v_mov_b32_e32 v1, s1
1456 ; VI-NEXT: flat_store_dword v[0:1], v2
1459 ; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
1461 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1462 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1463 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1464 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1465 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1466 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1467 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1468 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
1469 ; GFX9-NEXT: s_endpgm
1471 ; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
1473 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1474 ; CI-NEXT: s_mov_b32 s3, 0xf000
1475 ; CI-NEXT: s_mov_b32 s6, 0
1476 ; CI-NEXT: s_mov_b32 s7, s3
1477 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1478 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1479 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
1480 ; CI-NEXT: v_mov_b32_e32 v1, 0
1481 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1482 ; CI-NEXT: s_mov_b32 s2, -1
1483 ; CI-NEXT: s_waitcnt vmcnt(0)
1484 ; CI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1485 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1486 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1487 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1488 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1489 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1490 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1491 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1492 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1493 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
1494 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1497 ; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
1499 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1500 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1501 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1503 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1504 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1505 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1506 ; GFX11-NEXT: s_nop 0
1507 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1508 ; GFX11-NEXT: s_endpgm
1509 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1510 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1511 %val = load <2 x half>, ptr addrspace(1) %gep
1512 %fneg.val = fneg <2 x half> %val
1513 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
1514 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize a <2 x half> that arrives as a kernel argument: %val.arg (i32,
; zeroext) is bitcast to <2 x half>, canonicalized, and stored to %out. The
; operand is fetched with scalar loads in every expected sequence below.
; Expected lowering per target, as pinned by the checks:
;  - tonga: the high half is extracted with s_lshr_b32 and copied to a VGPR for
;    the SDWA v_max_f16; the low half uses v_max_f16 directly on the SGPR.
;  - gfx900 / gfx1100: a single v_pk_max_f16 taking the SGPR for both sources.
;  - kaveri: each half is converted to f32, multiplied by 1.0, converted back
;    to f16, and repacked with shift + OR.
1518 define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
1519 ; VI-LABEL: s_test_canonicalize_var_v2f16:
1521 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1522 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1523 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1524 ; VI-NEXT: s_lshr_b32 s3, s2, 16
1525 ; VI-NEXT: v_mov_b32_e32 v1, s3
1526 ; VI-NEXT: v_max_f16_e64 v0, s2, s2
1527 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1528 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
1529 ; VI-NEXT: v_mov_b32_e32 v0, s0
1530 ; VI-NEXT: v_mov_b32_e32 v1, s1
1531 ; VI-NEXT: flat_store_dword v[0:1], v2
1534 ; GFX9-LABEL: s_test_canonicalize_var_v2f16:
1536 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1537 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1538 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1539 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1540 ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
1541 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1542 ; GFX9-NEXT: s_endpgm
1544 ; CI-LABEL: s_test_canonicalize_var_v2f16:
1546 ; CI-NEXT: s_load_dword s2, s[0:1], 0xb
1547 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1548 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1549 ; CI-NEXT: s_lshr_b32 s3, s2, 16
1550 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
1551 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
1552 ; CI-NEXT: s_mov_b32 s3, 0xf000
1553 ; CI-NEXT: s_mov_b32 s2, -1
1554 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1555 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1556 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1557 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1558 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1559 ; CI-NEXT: v_or_b32_e32 v0, v1, v0
1560 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1563 ; GFX11-LABEL: s_test_canonicalize_var_v2f16:
1565 ; GFX11-NEXT: s_clause 0x1
1566 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1567 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1568 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1569 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1570 ; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
1571 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1572 ; GFX11-NEXT: s_nop 0
1573 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1574 ; GFX11-NEXT: s_endpgm
1575 %val = bitcast i32 %val.arg to <2 x half>
1576 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
1577 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-folding test: canonicalizing <2 x half> zeroinitializer (+0.0 in both
; lanes) is expected to fold to the 32-bit constant 0, written with a single
; dword store and no arithmetic. On gfx900 / gfx1100 the same zero register is
; expected to serve as both the address offset and the stored data.
1581 define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 {
1582 ; VI-LABEL: test_fold_canonicalize_p0_v2f16:
1584 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1585 ; VI-NEXT: v_mov_b32_e32 v2, 0
1586 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1587 ; VI-NEXT: v_mov_b32_e32 v0, s0
1588 ; VI-NEXT: v_mov_b32_e32 v1, s1
1589 ; VI-NEXT: flat_store_dword v[0:1], v2
1592 ; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
1594 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1595 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1596 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1597 ; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
1598 ; GFX9-NEXT: s_endpgm
1600 ; CI-LABEL: test_fold_canonicalize_p0_v2f16:
1602 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1603 ; CI-NEXT: s_mov_b32 s3, 0xf000
1604 ; CI-NEXT: s_mov_b32 s2, -1
1605 ; CI-NEXT: v_mov_b32_e32 v0, 0
1606 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1607 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1610 ; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
1612 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1613 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1614 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1615 ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
1616 ; GFX11-NEXT: s_nop 0
1617 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1618 ; GFX11-NEXT: s_endpgm
1619 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
1620 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-folding test: canonicalizing <-0.0, -0.0> is expected to fold to the
; packed constant 0x80008000 (sign bit set in each 16-bit lane, i.e. the sign
; of zero is kept), written with a single dword store.
1624 define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 {
1625 ; VI-LABEL: test_fold_canonicalize_n0_v2f16:
1627 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1628 ; VI-NEXT: v_mov_b32_e32 v2, 0x80008000
1629 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1630 ; VI-NEXT: v_mov_b32_e32 v0, s0
1631 ; VI-NEXT: v_mov_b32_e32 v1, s1
1632 ; VI-NEXT: flat_store_dword v[0:1], v2
1635 ; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
1637 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1638 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1639 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000
1640 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1642 ; GFX9-NEXT: s_endpgm
1644 ; CI-LABEL: test_fold_canonicalize_n0_v2f16:
1646 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1647 ; CI-NEXT: s_mov_b32 s3, 0xf000
1648 ; CI-NEXT: s_mov_b32 s2, -1
1649 ; CI-NEXT: v_mov_b32_e32 v0, 0x80008000
1650 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1651 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1654 ; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
1656 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1657 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
1658 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1659 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1660 ; GFX11-NEXT: s_nop 0
1661 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1662 ; GFX11-NEXT: s_endpgm
1663 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
1664 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-folding test: canonicalizing <1.0, 1.0> is expected to fold to the
; packed constant 0x3c003c00 (0x3c00 is f16 1.0 in each lane), written with a
; single dword store.
1668 define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 {
1669 ; VI-LABEL: test_fold_canonicalize_p1_v2f16:
1671 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1672 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00
1673 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1674 ; VI-NEXT: v_mov_b32_e32 v0, s0
1675 ; VI-NEXT: v_mov_b32_e32 v1, s1
1676 ; VI-NEXT: flat_store_dword v[0:1], v2
1679 ; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
1681 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1682 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1683 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00
1684 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1685 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1686 ; GFX9-NEXT: s_endpgm
1688 ; CI-LABEL: test_fold_canonicalize_p1_v2f16:
1690 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1691 ; CI-NEXT: s_mov_b32 s3, 0xf000
1692 ; CI-NEXT: s_mov_b32 s2, -1
1693 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00
1694 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1695 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1698 ; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
1700 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1701 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
1702 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1703 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1704 ; GFX11-NEXT: s_nop 0
1705 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1706 ; GFX11-NEXT: s_endpgm
1707 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
1708 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-folding test: canonicalizing <-1.0, -1.0> is expected to fold to the
; packed constant 0xbc00bc00 (0xbc00 is f16 -1.0 in each lane), written with a
; single dword store.
1712 define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 {
1713 ; VI-LABEL: test_fold_canonicalize_n1_v2f16:
1715 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1716 ; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00
1717 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1718 ; VI-NEXT: v_mov_b32_e32 v0, s0
1719 ; VI-NEXT: v_mov_b32_e32 v1, s1
1720 ; VI-NEXT: flat_store_dword v[0:1], v2
1723 ; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
1725 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1726 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1727 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00
1728 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1729 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1730 ; GFX9-NEXT: s_endpgm
1732 ; CI-LABEL: test_fold_canonicalize_n1_v2f16:
1734 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1735 ; CI-NEXT: s_mov_b32 s3, 0xf000
1736 ; CI-NEXT: s_mov_b32 s2, -1
1737 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00
1738 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1739 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1742 ; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
1744 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1745 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
1746 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1747 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1748 ; GFX11-NEXT: s_nop 0
1749 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1750 ; GFX11-NEXT: s_endpgm
1751 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
1752 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-folding test: canonicalizing <16.0, 16.0> is expected to fold to the
; packed constant 0x4c004c00 (0x4c00 is f16 16.0 in each lane), materialized as
; a 32-bit literal and written with a single dword store.
1756 define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 {
1757 ; VI-LABEL: test_fold_canonicalize_literal_v2f16:
1759 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1760 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00
1761 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1762 ; VI-NEXT: v_mov_b32_e32 v0, s0
1763 ; VI-NEXT: v_mov_b32_e32 v1, s1
1764 ; VI-NEXT: flat_store_dword v[0:1], v2
1767 ; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
1769 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1770 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1771 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00
1772 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1773 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1774 ; GFX9-NEXT: s_endpgm
1776 ; CI-LABEL: test_fold_canonicalize_literal_v2f16:
1778 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1779 ; CI-NEXT: s_mov_b32 s3, 0xf000
1780 ; CI-NEXT: s_mov_b32 s2, -1
1781 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00
1782 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1783 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1786 ; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
1788 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1789 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
1790 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1791 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1792 ; GFX11-NEXT: s_nop 0
1793 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1794 ; GFX11-NEXT: s_endpgm
1795 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
1796 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a positive denormal splat: 0xH03FF has a zero exponent
; field and an all-ones mantissa, i.e. the largest positive half subnormal.
; Every prefix expects the packed literal 0x3ff03ff to be stored, so the
; expected output keeps the denormal bit pattern instead of flushing it to zero.
; NOTE(review): attribute group #1 is defined outside this chunk; confirm which
; f16 denormal mode it selects and that preserving the value matches the
; "no_denormals" part of the test name.
1800 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 {
1801 ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1803 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1804 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
1805 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1806 ; VI-NEXT: v_mov_b32_e32 v0, s0
1807 ; VI-NEXT: v_mov_b32_e32 v1, s1
1808 ; VI-NEXT: flat_store_dword v[0:1], v2
1811 ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1813 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1814 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1815 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
1816 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1817 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1818 ; GFX9-NEXT: s_endpgm
1820 ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1822 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1823 ; CI-NEXT: s_mov_b32 s3, 0xf000
1824 ; CI-NEXT: s_mov_b32 s2, -1
1825 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff
1826 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1827 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1830 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1832 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1833 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
1834 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1835 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1836 ; GFX11-NEXT: s_nop 0
1837 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1838 ; GFX11-NEXT: s_endpgm
1839 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
1840 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Same positive-denormal splat (0xH03FF in both lanes) as the "no_denormals"
; variant of this test, but compiled under attribute group #3 instead of #1.
; Every prefix expects the unchanged packed literal 0x3ff03ff to be stored.
; NOTE(review): attribute group #3 is defined outside this chunk; presumably it
; enables f16 denormals, as the test name suggests -- confirm.
1844 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 {
1845 ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1847 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1848 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
1849 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1850 ; VI-NEXT: v_mov_b32_e32 v0, s0
1851 ; VI-NEXT: v_mov_b32_e32 v1, s1
1852 ; VI-NEXT: flat_store_dword v[0:1], v2
1855 ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1857 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1858 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1859 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
1860 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1861 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1862 ; GFX9-NEXT: s_endpgm
1864 ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1866 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1867 ; CI-NEXT: s_mov_b32 s3, 0xf000
1868 ; CI-NEXT: s_mov_b32 s2, -1
1869 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff
1870 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1871 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1874 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1876 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1877 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
1878 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1879 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1880 ; GFX11-NEXT: s_nop 0
1881 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1882 ; GFX11-NEXT: s_endpgm
1883 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
1884 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a negative denormal splat: 0xH83FF is the sign bit plus
; a zero exponent field and an all-ones mantissa (largest-magnitude negative
; half subnormal). Every prefix expects the packed literal 0x83ff83ff to be
; stored, so the expected output keeps the denormal bit pattern unchanged.
; NOTE(review): attribute group #1 is defined outside this chunk; confirm which
; f16 denormal mode it selects.
1888 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 {
1889 ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1891 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1892 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
1893 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1894 ; VI-NEXT: v_mov_b32_e32 v0, s0
1895 ; VI-NEXT: v_mov_b32_e32 v1, s1
1896 ; VI-NEXT: flat_store_dword v[0:1], v2
1899 ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1901 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1902 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1903 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
1904 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1905 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1906 ; GFX9-NEXT: s_endpgm
1908 ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1910 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1911 ; CI-NEXT: s_mov_b32 s3, 0xf000
1912 ; CI-NEXT: s_mov_b32 s2, -1
1913 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff
1914 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1915 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1918 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1920 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1921 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
1922 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1923 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1924 ; GFX11-NEXT: s_nop 0
1925 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1926 ; GFX11-NEXT: s_endpgm
1927 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
1928 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Same negative-denormal splat (0xH83FF in both lanes) as the "no_denormals"
; variant of this test, but compiled under attribute group #3 instead of #1.
; Every prefix expects the unchanged packed literal 0x83ff83ff to be stored.
; NOTE(review): attribute group #3 is defined outside this chunk; presumably it
; enables f16 denormals, as the test name suggests -- confirm.
1932 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 {
1933 ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1935 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1936 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
1937 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1938 ; VI-NEXT: v_mov_b32_e32 v0, s0
1939 ; VI-NEXT: v_mov_b32_e32 v1, s1
1940 ; VI-NEXT: flat_store_dword v[0:1], v2
1943 ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1945 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1946 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1947 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
1948 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1949 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1950 ; GFX9-NEXT: s_endpgm
1952 ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1954 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1955 ; CI-NEXT: s_mov_b32 s3, 0xf000
1956 ; CI-NEXT: s_mov_b32 s2, -1
1957 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff
1958 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1959 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1962 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1964 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1965 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
1966 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1967 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1968 ; GFX11-NEXT: s_nop 0
1969 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1970 ; GFX11-NEXT: s_endpgm
1971 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
1972 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on the splat <0xH7C00, 0xH7C00>. Every prefix expects the
; packed literal 0x7c007c00 to be stored, i.e. the input bits pass through
; unchanged.
; NOTE(review): despite the "qnan" in the test name, 0xH7C00 has an all-ones
; exponent and a zero mantissa, which is +infinity in IEEE half rather than a
; NaN (the canonical quiet NaN used by the neighbouring tests is 0x7e00).
1976 define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 {
1977 ; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
1979 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1980 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00
1981 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1982 ; VI-NEXT: v_mov_b32_e32 v0, s0
1983 ; VI-NEXT: v_mov_b32_e32 v1, s1
1984 ; VI-NEXT: flat_store_dword v[0:1], v2
1987 ; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
1989 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1990 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1991 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00
1992 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1993 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1994 ; GFX9-NEXT: s_endpgm
1996 ; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
1998 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1999 ; CI-NEXT: s_mov_b32 s3, 0xf000
2000 ; CI-NEXT: s_mov_b32 s2, -1
2001 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00
2002 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2003 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2006 ; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
2008 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2009 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
2010 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2011 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2012 ; GFX11-NEXT: s_nop 0
2013 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2014 ; GFX11-NEXT: s_endpgm
2015 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
2016 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a NaN with every bit set: the operand is i32 -1 bitcast
; to <2 x half>, so each lane is 0xFFFF (sign set, all-ones exponent, all-ones
; mantissa). Every prefix expects the packed literal 0x7e007e00, i.e. each lane
; is folded to 0x7e00: sign cleared, only the top mantissa bit left set.
2020 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 {
2021 ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2023 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2024 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2025 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2026 ; VI-NEXT: v_mov_b32_e32 v0, s0
2027 ; VI-NEXT: v_mov_b32_e32 v1, s1
2028 ; VI-NEXT: flat_store_dword v[0:1], v2
2031 ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2033 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2034 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2035 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2036 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2037 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2038 ; GFX9-NEXT: s_endpgm
2040 ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2042 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2043 ; CI-NEXT: s_mov_b32 s3, 0xf000
2044 ; CI-NEXT: s_mov_b32 s2, -1
2045 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2046 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2047 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2050 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2052 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2053 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2054 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2055 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2056 ; GFX11-NEXT: s_nop 0
2057 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2058 ; GFX11-NEXT: s_endpgm
2059 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
2060 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a negative NaN with payload: each lane is i16 -2 bitcast
; to half, i.e. 0xFFFE (sign set, all-ones exponent, mantissa 0x3FE). Every
; prefix expects the packed literal 0x7e007e00, so each lane is folded to
; 0x7e00 and neither the sign nor the payload bits survive.
2064 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 {
2065 ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2067 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2068 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2069 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2070 ; VI-NEXT: v_mov_b32_e32 v0, s0
2071 ; VI-NEXT: v_mov_b32_e32 v1, s1
2072 ; VI-NEXT: flat_store_dword v[0:1], v2
2075 ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2077 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2078 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2079 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2080 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2081 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2082 ; GFX9-NEXT: s_endpgm
2084 ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2086 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2087 ; CI-NEXT: s_mov_b32 s3, 0xf000
2088 ; CI-NEXT: s_mov_b32 s2, -1
2089 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2090 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2091 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2094 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2096 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2097 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2098 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2099 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2100 ; GFX11-NEXT: s_nop 0
2101 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2102 ; GFX11-NEXT: s_endpgm
2103 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
2104 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a positive signaling NaN with the smallest payload:
; 0xH7C01 has an all-ones exponent, the top mantissa bit (the quiet bit) clear
; and only mantissa bit 0 set. Every prefix expects the packed literal
; 0x7e007e00, i.e. each lane is folded to the quiet NaN 0x7e00.
2108 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 {
2109 ; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2111 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2112 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2113 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2114 ; VI-NEXT: v_mov_b32_e32 v0, s0
2115 ; VI-NEXT: v_mov_b32_e32 v1, s1
2116 ; VI-NEXT: flat_store_dword v[0:1], v2
2119 ; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2121 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2122 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2123 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2124 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2125 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2126 ; GFX9-NEXT: s_endpgm
2128 ; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2130 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2131 ; CI-NEXT: s_mov_b32 s3, 0xf000
2132 ; CI-NEXT: s_mov_b32 s2, -1
2133 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2134 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2135 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2138 ; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2140 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2141 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2142 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2143 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2144 ; GFX11-NEXT: s_nop 0
2145 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2146 ; GFX11-NEXT: s_endpgm
2147 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
2148 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a positive signaling NaN with the largest payload:
; 0xH7DFF has an all-ones exponent, the top mantissa bit (the quiet bit) clear
; and all nine lower mantissa bits set. Every prefix expects the packed literal
; 0x7e007e00, i.e. each lane is folded to the quiet NaN 0x7e00 and the payload
; bits are dropped.
2152 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 {
2153 ; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2155 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2156 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2157 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2158 ; VI-NEXT: v_mov_b32_e32 v0, s0
2159 ; VI-NEXT: v_mov_b32_e32 v1, s1
2160 ; VI-NEXT: flat_store_dword v[0:1], v2
2163 ; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2165 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2166 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2167 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2169 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2170 ; GFX9-NEXT: s_endpgm
2172 ; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2174 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2175 ; CI-NEXT: s_mov_b32 s3, 0xf000
2176 ; CI-NEXT: s_mov_b32 s2, -1
2177 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2178 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2179 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2182 ; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2184 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2185 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2186 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2187 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2188 ; GFX11-NEXT: s_nop 0
2189 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2190 ; GFX11-NEXT: s_endpgm
2191 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
2192 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a negative signaling NaN with the largest payload:
; 0xHFDFF is 0xH7DFF with the sign bit set. Every prefix expects the packed
; literal 0x7e007e00, i.e. each lane is folded to the positive quiet NaN 0x7e00
; (sign cleared, payload dropped).
2196 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 {
2197 ; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2199 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2200 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2201 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2202 ; VI-NEXT: v_mov_b32_e32 v0, s0
2203 ; VI-NEXT: v_mov_b32_e32 v1, s1
2204 ; VI-NEXT: flat_store_dword v[0:1], v2
2207 ; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2209 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2210 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2211 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2212 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2213 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2214 ; GFX9-NEXT: s_endpgm
2216 ; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2218 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2219 ; CI-NEXT: s_mov_b32 s3, 0xf000
2220 ; CI-NEXT: s_mov_b32 s2, -1
2221 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2222 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2223 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2226 ; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2228 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2229 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2230 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2231 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2232 ; GFX11-NEXT: s_nop 0
2233 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2234 ; GFX11-NEXT: s_endpgm
2235 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
2236 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Constant-fold test on a negative signaling NaN with the smallest payload:
; 0xHFC01 is 0xH7C01 with the sign bit set. Every prefix expects the packed
; literal 0x7e007e00, i.e. each lane is folded to the positive quiet NaN 0x7e00
; (sign cleared, payload dropped).
2240 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 {
2241 ; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2243 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2244 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
2245 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2246 ; VI-NEXT: v_mov_b32_e32 v0, s0
2247 ; VI-NEXT: v_mov_b32_e32 v1, s1
2248 ; VI-NEXT: flat_store_dword v[0:1], v2
2251 ; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2253 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2254 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2255 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2256 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2257 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2258 ; GFX9-NEXT: s_endpgm
2260 ; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2262 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2263 ; CI-NEXT: s_mov_b32 s3, 0xf000
2264 ; CI-NEXT: s_mov_b32 s2, -1
2265 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2266 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2267 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2270 ; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2272 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2273 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2274 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2275 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2276 ; GFX11-NEXT: s_nop 0
2277 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2278 ; GFX11-NEXT: s_endpgm
2279 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
2280 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize of a non-constant <3 x half> argument, returned by value from an
; ordinary (non-kernel) function. Expected lowering per target:
;  - tonga (VI prefix): a v_max_f16 of each element with itself; the high half
;    of v0 goes through the SDWA form and is OR-ed back into v0.
;  - gfx900 and gfx1100: two packed v_pk_max_f16 ops, one per 32-bit register.
;  - kaveri (CI prefix): the checks show the three elements in v0..v2 being
;    converted f32 -> f16 and then f16 -> f32, with no separate canonicalize op.
2284 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
2285 ; VI-LABEL: v_test_canonicalize_var_v3f16:
2287 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2288 ; VI-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2289 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2290 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2291 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
2292 ; VI-NEXT: s_setpc_b64 s[30:31]
2294 ; GFX9-LABEL: v_test_canonicalize_var_v3f16:
2296 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2298 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2299 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2301 ; CI-LABEL: v_test_canonicalize_var_v3f16:
2303 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2304 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2305 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2306 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2307 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2308 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2309 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2310 ; CI-NEXT: s_setpc_b64 s[30:31]
2312 ; GFX11-LABEL: v_test_canonicalize_var_v3f16:
2314 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2316 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2317 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2318 %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
2319 ret <3 x half> %canonicalized
; Canonicalize of a non-constant <4 x half> argument, returned by value from an
; ordinary (non-kernel) function. Expected lowering per target:
;  - tonga (VI prefix): a v_max_f16 of each element with itself; both high
;    halves use the SDWA form and are OR-ed back into v0 and v1.
;  - gfx900 and gfx1100: two packed v_pk_max_f16 ops, one per 32-bit register.
;  - kaveri (CI prefix): the checks show the four elements in v0..v3 being
;    converted f32 -> f16 and then f16 -> f32, with no separate canonicalize op.
2322 define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
2323 ; VI-LABEL: v_test_canonicalize_var_v4f16:
2325 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2326 ; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2327 ; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2328 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2329 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2330 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
2331 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
2332 ; VI-NEXT: s_setpc_b64 s[30:31]
2334 ; GFX9-LABEL: v_test_canonicalize_var_v4f16:
2336 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2337 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2338 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2339 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2341 ; CI-LABEL: v_test_canonicalize_var_v4f16:
2343 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2344 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2345 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2346 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2347 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2348 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2349 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2350 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2351 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2352 ; CI-NEXT: s_setpc_b64 s[30:31]
2354 ; GFX11-LABEL: v_test_canonicalize_var_v4f16:
2356 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2357 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2358 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2359 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2360 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
2361 ret <4 x half> %canonicalized
; Canonicalize of a fully undef <2 x half>, stored to %out. Every prefix expects
; a plain zero to be materialized and stored as one dword; on gfx900 and gfx1100
; the same zero VGPR is used as both the address offset and the stored data.
2364 define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 {
2365 ; VI-LABEL: s_test_canonicalize_undef_v2f16:
2367 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2368 ; VI-NEXT: v_mov_b32_e32 v2, 0
2369 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2370 ; VI-NEXT: v_mov_b32_e32 v0, s0
2371 ; VI-NEXT: v_mov_b32_e32 v1, s1
2372 ; VI-NEXT: flat_store_dword v[0:1], v2
2375 ; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
2377 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2378 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2379 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2380 ; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
2381 ; GFX9-NEXT: s_endpgm
2383 ; CI-LABEL: s_test_canonicalize_undef_v2f16:
2385 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2386 ; CI-NEXT: s_mov_b32 s3, 0xf000
2387 ; CI-NEXT: s_mov_b32 s2, -1
2388 ; CI-NEXT: v_mov_b32_e32 v0, 0
2389 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2390 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2393 ; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
2395 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2396 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2397 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2398 ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
2399 ; GFX11-NEXT: s_nop 0
2400 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2401 ; GFX11-NEXT: s_endpgm
2402 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
2403 store <2 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize of a <2 x half> whose lane 0 is the argument %val and whose
; lane 1 is undef. Expected lowering per target:
;  - tonga (VI prefix): a single v_max_f16 of v0 with itself.
;  - gfx900 and gfx1100: v_max_f16 of v0, then v_pack_b32_f16 with 0 as the
;    high half.
;  - kaveri (CI prefix): v0 goes through an f32 -> f16 -> f32 round trip and a
;    multiply by 1.0; the undef lane is returned in v1 as 0x7fc00000.
2407 define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
2408 ; VI-LABEL: v_test_canonicalize_reg_undef_v2f16:
2410 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2411 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2412 ; VI-NEXT: s_setpc_b64 s[30:31]
2414 ; GFX9-LABEL: v_test_canonicalize_reg_undef_v2f16:
2416 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
2418 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
2419 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2421 ; CI-LABEL: v_test_canonicalize_reg_undef_v2f16:
2423 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2425 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
2426 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2427 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
2428 ; CI-NEXT: s_setpc_b64 s[30:31]
2430 ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
2432 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2433 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2434 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2435 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
2436 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2437 %vec = insertelement <2 x half> undef, half %val, i32 0
2438 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2439 ret <2 x half> %canonicalized
; Canonicalize of a <2 x half> whose lane 0 is undef and whose lane 1 is the
; argument %val. Expected lowering per target:
;  - tonga and gfx900: one SDWA v_max_f16 that writes its result into WORD_1
;    (the high half) of v0.
;  - gfx1100: v_max_f16 followed by a left shift by 16 to move the result into
;    the high half.
;  - kaveri (CI prefix): v0 goes through an f32 -> f16 -> f32 round trip and a
;    multiply by 1.0 into v1; the undef lane is returned in v0 as 0x7fc00000.
2442 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
2443 ; VI-LABEL: v_test_canonicalize_undef_reg_v2f16:
2445 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2446 ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2447 ; VI-NEXT: s_setpc_b64 s[30:31]
2449 ; GFX9-LABEL: v_test_canonicalize_undef_reg_v2f16:
2451 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452 ; GFX9-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2453 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2455 ; CI-LABEL: v_test_canonicalize_undef_reg_v2f16:
2457 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2458 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2459 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2460 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0
2461 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
2462 ; CI-NEXT: s_setpc_b64 s[30:31]
2464 ; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16:
2466 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2467 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2468 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2469 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2470 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2471 %vec = insertelement <2 x half> undef, half %val, i32 1
2472 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2473 ret <2 x half> %canonicalized
; Canonicalize of the constant vector <undef, 1.0>. On tonga, gfx900 and gfx1100
; the whole result is expected as one v_bfrev_b32 of 60: bit-reversing 60 in
; 32 bits gives 0x3c000000, i.e. half 1.0 (0x3c00) in the high lane and zero in
; the low lane. On kaveri (CI prefix) the two lanes are returned as separate
; registers, v0 = 0 and v1 = 1.0.
2476 define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
2477 ; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2479 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2480 ; VI-NEXT: v_bfrev_b32_e32 v0, 60
2481 ; VI-NEXT: s_setpc_b64 s[30:31]
2483 ; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2485 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2486 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 60
2487 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2489 ; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2491 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2492 ; CI-NEXT: v_mov_b32_e32 v0, 0
2493 ; CI-NEXT: v_mov_b32_e32 v1, 1.0
2494 ; CI-NEXT: s_setpc_b64 s[30:31]
2496 ; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2498 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2499 ; GFX11-NEXT: v_bfrev_b32_e32 v0, 60
2500 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2501 %vec = insertelement <2 x half> undef, half 1.0, i32 1
2502 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2503 ret <2 x half> %canonicalized
; Canonicalize of the constant vector <1.0, undef>. On tonga, gfx900 and gfx1100
; the result is expected as the literal 0x3c00: half 1.0 in the low lane with a
; zero high lane. On kaveri (CI prefix) the two lanes are returned as separate
; registers, v0 = 1.0 and v1 = 0.
2506 define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
2507 ; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2509 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2510 ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00
2511 ; VI-NEXT: s_setpc_b64 s[30:31]
2513 ; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2515 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2516 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
2517 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2519 ; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2521 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2522 ; CI-NEXT: v_mov_b32_e32 v0, 1.0
2523 ; CI-NEXT: v_mov_b32_e32 v1, 0
2524 ; CI-NEXT: s_setpc_b64 s[30:31]
2526 ; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2528 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2529 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00
2530 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2531 %vec = insertelement <2 x half> undef, half 1.0, i32 0
2532 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2533 ret <2 x half> %canonicalized
; Canonicalize of the constant vector <undef, 16.0>. On tonga, gfx900 and
; gfx1100 the whole result is expected as one v_bfrev_b32 of 50: bit-reversing
; 50 in 32 bits gives 0x4c000000, i.e. half 16.0 (0x4c00) in the high lane and
; zero in the low lane. On kaveri (CI prefix) the lanes are returned as separate
; registers, v0 = 0 and v1 = 0x41800000 (16.0 as an f32 bit pattern).
2536 define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
2537 ; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2539 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2540 ; VI-NEXT: v_bfrev_b32_e32 v0, 50
2541 ; VI-NEXT: s_setpc_b64 s[30:31]
2543 ; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2545 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2546 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 50
2547 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2549 ; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2551 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2552 ; CI-NEXT: v_mov_b32_e32 v0, 0
2553 ; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
2554 ; CI-NEXT: s_setpc_b64 s[30:31]
2556 ; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2558 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2559 ; GFX11-NEXT: v_bfrev_b32_e32 v0, 50
2560 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2561 %vec = insertelement <2 x half> undef, half 16.0, i32 1
2562 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2563 ret <2 x half> %canonicalized
; Canonicalize of the constant vector <16.0, undef>. On tonga, gfx900 and
; gfx1100 the result is expected as the literal 0x4c00: half 16.0 in the low
; lane with a zero high lane. On kaveri (CI prefix) the lanes are returned as
; separate registers, v0 = 0x41800000 (16.0 as an f32 bit pattern) and v1 = 0.
2566 define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
2567 ; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2569 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2570 ; VI-NEXT: v_mov_b32_e32 v0, 0x4c00
2571 ; VI-NEXT: s_setpc_b64 s[30:31]
2573 ; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2575 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2576 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4c00
2577 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2579 ; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2581 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2582 ; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
2583 ; CI-NEXT: v_mov_b32_e32 v1, 0
2584 ; CI-NEXT: s_setpc_b64 s[30:31]
2586 ; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2588 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2589 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4c00
2590 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2591 %vec = insertelement <2 x half> undef, half 16.0, i32 0
2592 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2593 ret <2 x half> %canonicalized
; Canonicalize of a <2 x half> whose lane 0 is the argument %val and whose
; lane 1 is the constant 2.0. Only the variable lane is expected to get a
; canonicalizing op; the constant lane is inserted directly:
;  - tonga (VI prefix): v_max_f16 of v0, then an OR with the inline constant
;    2.0, whose 32-bit pattern 0x40000000 carries half 2.0 (0x4000) in its
;    high 16 bits.
;  - gfx900 and gfx1100: v_max_f16 of v0, then v_pack_b32_f16 with 2.0 as the
;    high half.
;  - kaveri (CI prefix): v0 goes through an f32 -> f16 -> f32 round trip and
;    v1 is set to 2.0.
2596 define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
2597 ; VI-LABEL: v_test_canonicalize_reg_k_v2f16:
2599 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2600 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2601 ; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
2602 ; VI-NEXT: s_setpc_b64 s[30:31]
2604 ; GFX9-LABEL: v_test_canonicalize_reg_k_v2f16:
2606 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2607 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
2608 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, 2.0
2609 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2611 ; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
2613 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2614 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2615 ; CI-NEXT: v_mov_b32_e32 v1, 2.0
2616 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2617 ; CI-NEXT: s_setpc_b64 s[30:31]
2619 ; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16:
2621 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2622 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2623 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2624 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, 2.0
2625 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2626 %vec0 = insertelement <2 x half> undef, half %val, i32 0
2627 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
2628 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
2629 ret <2 x half> %canonicalized
; Canonicalize of a <2 x half> whose lane 0 is the constant 2.0 and whose
; lane 1 is the argument %val. Only the variable lane is expected to get a
; canonicalizing op; the constant lane is inserted directly:
;  - tonga (VI prefix): an SDWA v_max_f16 writes the result into WORD_1 of v0,
;    then an OR with 0x4000 (half 2.0) fills the low half.
;  - gfx900 and gfx1100: v_max_f16 of v0, then v_pack_b32_f16 with 2.0 as the
;    low half.
;  - kaveri (CI prefix): the argument goes through an f32 -> f16 -> f32 round
;    trip into v1 and v0 is set to 2.0.
2632 define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
2633 ; VI-LABEL: v_test_canonicalize_k_reg_v2f16:
2635 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636 ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2637 ; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
2638 ; VI-NEXT: s_setpc_b64 s[30:31]
2640 ; GFX9-LABEL: v_test_canonicalize_k_reg_v2f16:
2642 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2643 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
2644 ; GFX9-NEXT: v_pack_b32_f16 v0, 2.0, v0
2645 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2647 ; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
2649 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2650 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2651 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
2652 ; CI-NEXT: v_mov_b32_e32 v0, 2.0
2653 ; CI-NEXT: s_setpc_b64 s[30:31]
2655 ; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16:
2657 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2658 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2659 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2660 ; GFX11-NEXT: v_pack_b32_f16 v0, 2.0, v0
2661 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2662 %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
2663 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
2664 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
2665 ret <2 x half> %canonicalized
; Kernel that canonicalizes an entirely undef <4 x half> and stores the result
; to %out. Every target's checks contain no canonicalize arithmetic at all:
; two registers are set to 0 (v0 = 0, v1 = v0) and written with one 64-bit
; store, so the undef input is expected to fold to an all-zero constant.
2668 define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 {
2669 ; VI-LABEL: s_test_canonicalize_undef_v4f16:
2671 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2672 ; VI-NEXT: v_mov_b32_e32 v0, 0
2673 ; VI-NEXT: v_mov_b32_e32 v1, v0
2674 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2675 ; VI-NEXT: v_mov_b32_e32 v3, s1
2676 ; VI-NEXT: v_mov_b32_e32 v2, s0
2677 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2680 ; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
2682 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2683 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2684 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2685 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2686 ; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
2687 ; GFX9-NEXT: s_endpgm
2689 ; CI-LABEL: s_test_canonicalize_undef_v4f16:
2691 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2692 ; CI-NEXT: v_mov_b32_e32 v0, 0
2693 ; CI-NEXT: s_mov_b32 s3, 0xf000
2694 ; CI-NEXT: s_mov_b32 s2, -1
2695 ; CI-NEXT: v_mov_b32_e32 v1, v0
2696 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2697 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2700 ; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
2702 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2703 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2704 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2705 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
2706 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2707 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
2708 ; GFX11-NEXT: s_nop 0
2709 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2710 ; GFX11-NEXT: s_endpgm
2711 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
2712 store <4 x half> %canonicalized, ptr addrspace(1) %out
; Canonicalize a <4 x half> where only lane 0 is defined (%val); lanes 1-3 stay
; undef. Only lane 0 gets real canonicalize work (v_max_f16 on the f16 targets,
; a conversion pair plus v_mul_f32 by 1.0 on kaveri). The undef lanes are
; expected to become constants: v1 = 0x7e007e00 on the f16 targets and
; 0x7fc00000 in v1-v3 on kaveri (0x7e00 and 0x7fc00000 are the half and single
; precision quiet-NaN bit patterns).
; NOTE(review): the undef lane 1 differs between targets - tonga ORs 0x7e00
; into the upper half of v0, while gfx900 and gfx1100 pack a 0 there. Either is
; acceptable for an undef lane; confirm the difference is intentional.
2716 define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
2717 ; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2719 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2720 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2721 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
2722 ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2723 ; VI-NEXT: s_setpc_b64 s[30:31]
2725 ; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2727 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2728 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
2729 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
2730 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2731 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2733 ; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2735 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2737 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
2738 ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
2739 ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
2740 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2741 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
2742 ; CI-NEXT: s_setpc_b64 s[30:31]
2744 ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2746 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2747 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2748 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2749 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2750 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
2751 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2752 %vec = insertelement <4 x half> undef, half %val, i32 0
2753 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
2754 ret <4 x half> %canonicalized
; Canonicalize a <4 x half> whose lanes 0 and 1 come from %val0 and %val1 and
; whose lanes 2 and 3 are undef. Expected shapes per target:
;  - tonga: one v_max_f16 per defined lane (the second writes WORD_1 via SDWA),
;    joined with v_or_b32; the undef pair is the constant 0x7e007e00 in v1.
;  - gfx900 / gfx1100: the two halves are packed first with v_perm_b32
;    (selector 0x5040100) and canonicalized together by one v_pk_max_f16.
;  - kaveri: each defined lane goes through a conversion pair and a v_mul_f32
;    by 1.0; v2 and v3 are set to 0x7fc00000.
2757 define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
2758 ; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2760 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2761 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2762 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2763 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
2764 ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2765 ; VI-NEXT: s_setpc_b64 s[30:31]
2767 ; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2769 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2770 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
2771 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
2772 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2773 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2774 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2776 ; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2778 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2779 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2780 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2781 ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
2782 ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
2783 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2784 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2785 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
2786 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
2787 ; CI-NEXT: s_setpc_b64 s[30:31]
2789 ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2791 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2792 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
2793 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
2794 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2795 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2796 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2797 %vec0 = insertelement <4 x half> undef, half %val0, i32 0
2798 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
2799 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
2800 ret <4 x half> %canonicalized
; Canonicalize a <4 x half> with lanes 0, 2 and 3 taken from %val0, %val1 and
; %val2; lane 1 is left undef. In the f16-target checks the fully defined upper
; pair (lanes 2 and 3) ends up in v1 - two v_max_f16 plus v_or_b32 on tonga,
; v_perm_b32 plus one v_pk_max_f16 on gfx900 and gfx1100 - while the half
; defined lower pair in v0 is handled like the reg_undef_undef_undef case
; (v_max_f16 on lane 0, then a constant merged into the upper half). The kaveri
; checks canonicalize three scalars and put 0x7fc00000 in v1 for the undef
; lane; the register shuffling there (v2 <- v1, v3 <- converted v2) moves the
; defined lanes into result positions 2 and 3.
2803 define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
2804 ; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2806 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2807 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2808 ; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2809 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2810 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
2811 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
2812 ; VI-NEXT: s_setpc_b64 s[30:31]
2814 ; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2816 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2817 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
2818 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
2819 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
2820 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2821 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
2822 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2824 ; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2826 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2827 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2828 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2829 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2830 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2831 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2832 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
2833 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
2834 ; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1
2835 ; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
2836 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
2837 ; CI-NEXT: s_setpc_b64 s[30:31]
2839 ; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2841 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2842 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2843 ; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
2844 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2845 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
2846 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2847 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2848 %vec0 = insertelement <4 x half> undef, half %val0, i32 0
2849 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
2850 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
2851 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
2852 ret <4 x half> %canonicalized
; Canonicalize a fully variable <6 x half> argument and return it. The checks
; show the per-target lowering for three packed registers (v0-v2):
;  - tonga: for each register, an SDWA v_max_f16 on the upper half
;    (src sel WORD_1 -> dst WORD_1), a plain v_max_f16 on the lower half, and a
;    v_or_b32 to recombine.
;  - gfx900 / gfx1100: one v_pk_max_f16 per register.
;  - kaveri: six registers (v0-v5), each run through an f32 -> f16 -> f32
;    conversion pair.
2855 define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
2856 ; VI-LABEL: v_test_canonicalize_var_v6f16:
2858 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2859 ; VI-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2860 ; VI-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2861 ; VI-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2862 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
2863 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2864 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2865 ; VI-NEXT: v_or_b32_e32 v0, v0, v5
2866 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
2867 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
2868 ; VI-NEXT: s_setpc_b64 s[30:31]
2870 ; GFX9-LABEL: v_test_canonicalize_var_v6f16:
2872 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2873 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2874 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2875 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
2876 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2878 ; CI-LABEL: v_test_canonicalize_var_v6f16:
2880 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2881 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2882 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2883 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2884 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2885 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2886 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
2887 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2888 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2889 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2890 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2891 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
2892 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
2893 ; CI-NEXT: s_setpc_b64 s[30:31]
2895 ; GFX11-LABEL: v_test_canonicalize_var_v6f16:
2897 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2898 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2899 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2900 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
2901 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2902 %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val)
2903 ret <6 x half> %canonicalized
; Canonicalize a fully variable <8 x half> argument and return it. Same
; lowering pattern as the v6f16 test, scaled to four packed registers (v0-v3):
; tonga uses an SDWA v_max_f16 + v_max_f16 + v_or_b32 triple per register,
; gfx900 and gfx1100 use one v_pk_max_f16 per register, and kaveri uses an
; f32 -> f16 -> f32 conversion pair on each of eight registers (v0-v7).
2906 define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
2907 ; VI-LABEL: v_test_canonicalize_var_v8f16:
2909 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2910 ; VI-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2911 ; VI-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2912 ; VI-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2913 ; VI-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2914 ; VI-NEXT: v_max_f16_e32 v3, v3, v3
2915 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
2916 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2917 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2918 ; VI-NEXT: v_or_b32_e32 v0, v0, v7
2919 ; VI-NEXT: v_or_b32_e32 v1, v1, v6
2920 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
2921 ; VI-NEXT: v_or_b32_e32 v3, v3, v4
2922 ; VI-NEXT: s_setpc_b64 s[30:31]
2924 ; GFX9-LABEL: v_test_canonicalize_var_v8f16:
2926 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2927 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2928 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2929 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
2930 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
2931 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2933 ; CI-LABEL: v_test_canonicalize_var_v8f16:
2935 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2937 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2938 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2939 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2940 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2941 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
2942 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
2943 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2944 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2945 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2946 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2947 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2948 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
2949 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
2950 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
2951 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
2952 ; CI-NEXT: s_setpc_b64 s[30:31]
2954 ; GFX11-LABEL: v_test_canonicalize_var_v8f16:
2956 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2957 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
2958 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2959 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
2960 ; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
2961 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2962 %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val)
2963 ret <8 x half> %canonicalized
; Canonicalize a fully variable <12 x half> argument and return it. Same
; lowering pattern as the smaller vector tests, scaled to six packed registers
; (v0-v5): an SDWA v_max_f16 + v_max_f16 + v_or_b32 triple per register on
; tonga (v6-v11 hold the upper-half results), one v_pk_max_f16 per register on
; gfx900 and gfx1100, and an f32 -> f16 -> f32 conversion pair on each of
; twelve registers (v0-v11) on kaveri.
2966 define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
2967 ; VI-LABEL: v_test_canonicalize_var_v12f16:
2969 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2970 ; VI-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2971 ; VI-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2972 ; VI-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2973 ; VI-NEXT: v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2974 ; VI-NEXT: v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2975 ; VI-NEXT: v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2976 ; VI-NEXT: v_max_f16_e32 v5, v5, v5
2977 ; VI-NEXT: v_max_f16_e32 v4, v4, v4
2978 ; VI-NEXT: v_max_f16_e32 v3, v3, v3
2979 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
2980 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
2981 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
2982 ; VI-NEXT: v_or_b32_e32 v0, v0, v11
2983 ; VI-NEXT: v_or_b32_e32 v1, v1, v10
2984 ; VI-NEXT: v_or_b32_e32 v2, v2, v9
2985 ; VI-NEXT: v_or_b32_e32 v3, v3, v8
2986 ; VI-NEXT: v_or_b32_e32 v4, v4, v7
2987 ; VI-NEXT: v_or_b32_e32 v5, v5, v6
2988 ; VI-NEXT: s_setpc_b64 s[30:31]
2990 ; GFX9-LABEL: v_test_canonicalize_var_v12f16:
2992 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2993 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
2994 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2995 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
2996 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
2997 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v4
2998 ; GFX9-NEXT: v_pk_max_f16 v5, v5, v5
2999 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3001 ; CI-LABEL: v_test_canonicalize_var_v12f16:
3003 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3004 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
3005 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3006 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3007 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3008 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
3009 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
3010 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
3011 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
3012 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
3013 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
3014 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
3015 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3016 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
3017 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
3018 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3019 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3020 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
3021 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
3022 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
3023 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
3024 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
3025 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
3026 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
3027 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
3028 ; CI-NEXT: s_setpc_b64 s[30:31]
3030 ; GFX11-LABEL: v_test_canonicalize_var_v12f16:
3032 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3033 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
3034 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
3035 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
3036 ; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
3037 ; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
3038 ; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
3039 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3040 %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val)
3041 ret <12 x half> %canonicalized
; Canonicalize a fully variable <16 x half> argument and return it. Same
; lowering pattern as the smaller vector tests, scaled to eight packed
; registers (v0-v7): an SDWA v_max_f16 + v_max_f16 + v_or_b32 triple per
; register on tonga (v8-v15 hold the upper-half results), one v_pk_max_f16 per
; register on gfx900 and gfx1100, and an f32 -> f16 -> f32 conversion pair on
; each of sixteen registers (v0-v15) on kaveri.
3044 define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
3045 ; VI-LABEL: v_test_canonicalize_var_v16f16:
3047 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3048 ; VI-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3049 ; VI-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3050 ; VI-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3051 ; VI-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3052 ; VI-NEXT: v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3053 ; VI-NEXT: v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3054 ; VI-NEXT: v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3055 ; VI-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3056 ; VI-NEXT: v_max_f16_e32 v7, v7, v7
3057 ; VI-NEXT: v_max_f16_e32 v6, v6, v6
3058 ; VI-NEXT: v_max_f16_e32 v5, v5, v5
3059 ; VI-NEXT: v_max_f16_e32 v4, v4, v4
3060 ; VI-NEXT: v_max_f16_e32 v3, v3, v3
3061 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
3062 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
3063 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
3064 ; VI-NEXT: v_or_b32_e32 v0, v0, v15
3065 ; VI-NEXT: v_or_b32_e32 v1, v1, v14
3066 ; VI-NEXT: v_or_b32_e32 v2, v2, v13
3067 ; VI-NEXT: v_or_b32_e32 v3, v3, v12
3068 ; VI-NEXT: v_or_b32_e32 v4, v4, v11
3069 ; VI-NEXT: v_or_b32_e32 v5, v5, v10
3070 ; VI-NEXT: v_or_b32_e32 v6, v6, v9
3071 ; VI-NEXT: v_or_b32_e32 v7, v7, v8
3072 ; VI-NEXT: s_setpc_b64 s[30:31]
3074 ; GFX9-LABEL: v_test_canonicalize_var_v16f16:
3076 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3077 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
3078 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
3079 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
3080 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
3081 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v4
3082 ; GFX9-NEXT: v_pk_max_f16 v5, v5, v5
3083 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v6
3084 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v7
3085 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3087 ; CI-LABEL: v_test_canonicalize_var_v16f16:
3089 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3090 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
3091 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3092 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3093 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3094 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
3095 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
3096 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
3097 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
3098 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
3099 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
3100 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
3101 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3102 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3103 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
3104 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
3105 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
3106 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
3107 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
3108 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3109 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3110 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
3111 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
3112 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
3113 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
3114 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
3115 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
3116 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
3117 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
3118 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
3119 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
3120 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
3121 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
3122 ; CI-NEXT: s_setpc_b64 s[30:31]
3124 ; GFX11-LABEL: v_test_canonicalize_var_v16f16:
3126 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3127 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
3128 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
3129 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
3130 ; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
3131 ; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
3132 ; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
3133 ; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
3134 ; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
3135 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3136 %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val)
3137 ret <16 x half> %canonicalized
; Canonicalize a fully variable <32 x half> argument and return it. The
; lowering matches the smaller vector tests, scaled to sixteen packed registers
; (v0-v15), with two details specific to this size:
;  - tonga: the SDWA v_max_f16 / v_max_f16 / v_or_b32 triples for v0-v11 reuse
;    v20 as the temporary for the upper half, while v12-v15 use v19-v16; the
;    instructions for the last registers are interleaved rather than grouped.
;  - kaveri: elements 0-30 are in v0-v30 and each gets an f32 -> f16 -> f32
;    conversion pair; the value for v31 is first loaded with
;    buffer_load_dword from s32 and converted only after s_waitcnt vmcnt(0).
;    NOTE(review): presumably this is the 32nd element being passed on the
;    stack - confirm against the calling convention.
; gfx900 and gfx1100 use one v_pk_max_f16 per packed register.
3140 define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
3141 ; VI-LABEL: v_test_canonicalize_var_v32f16:
3143 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3144 ; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3145 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
3146 ; VI-NEXT: v_or_b32_e32 v0, v0, v20
3147 ; VI-NEXT: v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3148 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
3149 ; VI-NEXT: v_or_b32_e32 v1, v1, v20
3150 ; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3151 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
3152 ; VI-NEXT: v_or_b32_e32 v2, v2, v20
3153 ; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3154 ; VI-NEXT: v_max_f16_e32 v3, v3, v3
3155 ; VI-NEXT: v_or_b32_e32 v3, v3, v20
3156 ; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3157 ; VI-NEXT: v_max_f16_e32 v4, v4, v4
3158 ; VI-NEXT: v_or_b32_e32 v4, v4, v20
3159 ; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3160 ; VI-NEXT: v_max_f16_e32 v5, v5, v5
3161 ; VI-NEXT: v_or_b32_e32 v5, v5, v20
3162 ; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3163 ; VI-NEXT: v_max_f16_e32 v6, v6, v6
3164 ; VI-NEXT: v_or_b32_e32 v6, v6, v20
3165 ; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3166 ; VI-NEXT: v_max_f16_e32 v7, v7, v7
3167 ; VI-NEXT: v_or_b32_e32 v7, v7, v20
3168 ; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3169 ; VI-NEXT: v_max_f16_e32 v8, v8, v8
3170 ; VI-NEXT: v_or_b32_e32 v8, v8, v20
3171 ; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3172 ; VI-NEXT: v_max_f16_e32 v9, v9, v9
3173 ; VI-NEXT: v_or_b32_e32 v9, v9, v20
3174 ; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3175 ; VI-NEXT: v_max_f16_e32 v10, v10, v10
3176 ; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3177 ; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3178 ; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3179 ; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3180 ; VI-NEXT: v_or_b32_e32 v10, v10, v20
3181 ; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3182 ; VI-NEXT: v_max_f16_e32 v15, v15, v15
3183 ; VI-NEXT: v_max_f16_e32 v14, v14, v14
3184 ; VI-NEXT: v_max_f16_e32 v13, v13, v13
3185 ; VI-NEXT: v_max_f16_e32 v12, v12, v12
3186 ; VI-NEXT: v_max_f16_e32 v11, v11, v11
3187 ; VI-NEXT: v_or_b32_e32 v11, v11, v20
3188 ; VI-NEXT: v_or_b32_e32 v12, v12, v19
3189 ; VI-NEXT: v_or_b32_e32 v13, v13, v18
3190 ; VI-NEXT: v_or_b32_e32 v14, v14, v17
3191 ; VI-NEXT: v_or_b32_e32 v15, v15, v16
3192 ; VI-NEXT: s_setpc_b64 s[30:31]
3194 ; GFX9-LABEL: v_test_canonicalize_var_v32f16:
3196 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3197 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
3198 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
3199 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
3200 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
3201 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v4
3202 ; GFX9-NEXT: v_pk_max_f16 v5, v5, v5
3203 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v6
3204 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v7
3205 ; GFX9-NEXT: v_pk_max_f16 v8, v8, v8
3206 ; GFX9-NEXT: v_pk_max_f16 v9, v9, v9
3207 ; GFX9-NEXT: v_pk_max_f16 v10, v10, v10
3208 ; GFX9-NEXT: v_pk_max_f16 v11, v11, v11
3209 ; GFX9-NEXT: v_pk_max_f16 v12, v12, v12
3210 ; GFX9-NEXT: v_pk_max_f16 v13, v13, v13
3211 ; GFX9-NEXT: v_pk_max_f16 v14, v14, v14
3212 ; GFX9-NEXT: v_pk_max_f16 v15, v15, v15
3213 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3215 ; CI-LABEL: v_test_canonicalize_var_v32f16:
3217 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3218 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
3219 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
3220 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3221 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3222 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3223 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
3224 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
3225 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
3226 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
3227 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
3228 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
3229 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
3230 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3231 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3232 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
3233 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
3234 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
3235 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
3236 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
3237 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
3238 ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
3239 ; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
3240 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
3241 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
3242 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
3243 ; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
3244 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
3245 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3246 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
3247 ; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
3248 ; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
3249 ; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
3250 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
3251 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
3252 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3253 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3254 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
3255 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
3256 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
3257 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
3258 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
3259 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
3260 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
3261 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
3262 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
3263 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
3264 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
3265 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
3266 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
3267 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
3268 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
3269 ; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
3270 ; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
3271 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
3272 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
3273 ; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
3274 ; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
3275 ; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
3276 ; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
3277 ; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
3278 ; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
3279 ; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
3280 ; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
3281 ; CI-NEXT: s_waitcnt vmcnt(0)
3282 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
3283 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
3284 ; CI-NEXT: s_setpc_b64 s[30:31]
3286 ; GFX11-LABEL: v_test_canonicalize_var_v32f16:
3288 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3289 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
3290 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
3291 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
3292 ; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
3293 ; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
3294 ; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
3295 ; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
3296 ; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
3297 ; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
3298 ; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
3299 ; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
3300 ; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
3301 ; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
3302 ; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
3303 ; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
3304 ; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
3305 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3306 %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val)
3307 ret <32 x half> %canonicalized
3310 define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
3311 ; VI-LABEL: v_test_canonicalize_var_v64f16:
3313 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3314 ; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3315 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
3316 ; VI-NEXT: v_or_b32_e32 v0, v0, v31
3317 ; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3318 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
3319 ; VI-NEXT: v_or_b32_e32 v1, v1, v31
3320 ; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3321 ; VI-NEXT: v_max_f16_e32 v2, v2, v2
3322 ; VI-NEXT: v_or_b32_e32 v2, v2, v31
3323 ; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3324 ; VI-NEXT: v_max_f16_e32 v3, v3, v3
3325 ; VI-NEXT: v_or_b32_e32 v3, v3, v31
3326 ; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3327 ; VI-NEXT: v_max_f16_e32 v4, v4, v4
3328 ; VI-NEXT: v_or_b32_e32 v4, v4, v31
3329 ; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3330 ; VI-NEXT: v_max_f16_e32 v5, v5, v5
3331 ; VI-NEXT: v_or_b32_e32 v5, v5, v31
3332 ; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3333 ; VI-NEXT: v_max_f16_e32 v6, v6, v6
3334 ; VI-NEXT: v_or_b32_e32 v6, v6, v31
3335 ; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3336 ; VI-NEXT: v_max_f16_e32 v7, v7, v7
3337 ; VI-NEXT: v_or_b32_e32 v7, v7, v31
3338 ; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3339 ; VI-NEXT: v_max_f16_e32 v8, v8, v8
3340 ; VI-NEXT: v_or_b32_e32 v8, v8, v31
3341 ; VI-NEXT: v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3342 ; VI-NEXT: v_max_f16_e32 v9, v9, v9
3343 ; VI-NEXT: v_or_b32_e32 v9, v9, v31
3344 ; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3345 ; VI-NEXT: v_max_f16_e32 v10, v10, v10
3346 ; VI-NEXT: v_or_b32_e32 v10, v10, v31
3347 ; VI-NEXT: v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3348 ; VI-NEXT: v_max_f16_e32 v11, v11, v11
3349 ; VI-NEXT: v_or_b32_e32 v11, v11, v31
3350 ; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3351 ; VI-NEXT: v_max_f16_e32 v12, v12, v12
3352 ; VI-NEXT: v_or_b32_e32 v12, v12, v31
3353 ; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3354 ; VI-NEXT: v_max_f16_e32 v13, v13, v13
3355 ; VI-NEXT: v_or_b32_e32 v13, v13, v31
3356 ; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3357 ; VI-NEXT: v_max_f16_e32 v14, v14, v14
3358 ; VI-NEXT: v_or_b32_e32 v14, v14, v31
3359 ; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3360 ; VI-NEXT: v_max_f16_e32 v15, v15, v15
3361 ; VI-NEXT: v_or_b32_e32 v15, v15, v31
3362 ; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3363 ; VI-NEXT: v_max_f16_e32 v16, v16, v16
3364 ; VI-NEXT: v_or_b32_e32 v16, v16, v31
3365 ; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3366 ; VI-NEXT: v_max_f16_e32 v17, v17, v17
3367 ; VI-NEXT: v_or_b32_e32 v17, v17, v31
3368 ; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3369 ; VI-NEXT: v_max_f16_e32 v18, v18, v18
3370 ; VI-NEXT: v_or_b32_e32 v18, v18, v31
3371 ; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3372 ; VI-NEXT: v_max_f16_e32 v19, v19, v19
3373 ; VI-NEXT: v_or_b32_e32 v19, v19, v31
3374 ; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3375 ; VI-NEXT: v_max_f16_e32 v20, v20, v20
3376 ; VI-NEXT: v_or_b32_e32 v20, v20, v31
3377 ; VI-NEXT: v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3378 ; VI-NEXT: v_max_f16_e32 v21, v21, v21
3379 ; VI-NEXT: v_or_b32_e32 v21, v21, v31
3380 ; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3381 ; VI-NEXT: v_max_f16_e32 v22, v22, v22
3382 ; VI-NEXT: v_or_b32_e32 v22, v22, v31
3383 ; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3384 ; VI-NEXT: v_max_f16_e32 v23, v23, v23
3385 ; VI-NEXT: v_or_b32_e32 v23, v23, v31
3386 ; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3387 ; VI-NEXT: v_max_f16_e32 v24, v24, v24
3388 ; VI-NEXT: v_or_b32_e32 v24, v24, v31
3389 ; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3390 ; VI-NEXT: v_max_f16_e32 v25, v25, v25
3391 ; VI-NEXT: v_or_b32_e32 v25, v25, v31
3392 ; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3393 ; VI-NEXT: v_max_f16_e32 v26, v26, v26
3394 ; VI-NEXT: v_or_b32_e32 v26, v26, v31
3395 ; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3396 ; VI-NEXT: v_max_f16_e32 v27, v27, v27
3397 ; VI-NEXT: v_or_b32_e32 v27, v27, v31
3398 ; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3399 ; VI-NEXT: v_max_f16_e32 v28, v28, v28
3400 ; VI-NEXT: v_or_b32_e32 v28, v28, v31
3401 ; VI-NEXT: v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3402 ; VI-NEXT: v_max_f16_e32 v29, v29, v29
3403 ; VI-NEXT: v_or_b32_e32 v29, v29, v31
3404 ; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3405 ; VI-NEXT: v_max_f16_e32 v30, v30, v30
3406 ; VI-NEXT: v_or_b32_e32 v30, v30, v31
3407 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
3408 ; VI-NEXT: s_waitcnt vmcnt(0)
3409 ; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3410 ; VI-NEXT: v_max_f16_e32 v31, v31, v31
3411 ; VI-NEXT: v_or_b32_e32 v31, v31, v32
3412 ; VI-NEXT: s_setpc_b64 s[30:31]
3414 ; GFX9-LABEL: v_test_canonicalize_var_v64f16:
3416 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3417 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
3418 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
3419 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
3420 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
3421 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
3422 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v4
3423 ; GFX9-NEXT: v_pk_max_f16 v5, v5, v5
3424 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v6
3425 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v7
3426 ; GFX9-NEXT: v_pk_max_f16 v8, v8, v8
3427 ; GFX9-NEXT: v_pk_max_f16 v9, v9, v9
3428 ; GFX9-NEXT: v_pk_max_f16 v10, v10, v10
3429 ; GFX9-NEXT: v_pk_max_f16 v11, v11, v11
3430 ; GFX9-NEXT: v_pk_max_f16 v12, v12, v12
3431 ; GFX9-NEXT: v_pk_max_f16 v13, v13, v13
3432 ; GFX9-NEXT: v_pk_max_f16 v14, v14, v14
3433 ; GFX9-NEXT: v_pk_max_f16 v15, v15, v15
3434 ; GFX9-NEXT: v_pk_max_f16 v16, v16, v16
3435 ; GFX9-NEXT: v_pk_max_f16 v17, v17, v17
3436 ; GFX9-NEXT: v_pk_max_f16 v18, v18, v18
3437 ; GFX9-NEXT: v_pk_max_f16 v19, v19, v19
3438 ; GFX9-NEXT: v_pk_max_f16 v20, v20, v20
3439 ; GFX9-NEXT: v_pk_max_f16 v21, v21, v21
3440 ; GFX9-NEXT: v_pk_max_f16 v22, v22, v22
3441 ; GFX9-NEXT: v_pk_max_f16 v23, v23, v23
3442 ; GFX9-NEXT: v_pk_max_f16 v24, v24, v24
3443 ; GFX9-NEXT: v_pk_max_f16 v25, v25, v25
3444 ; GFX9-NEXT: v_pk_max_f16 v26, v26, v26
3445 ; GFX9-NEXT: v_pk_max_f16 v27, v27, v27
3446 ; GFX9-NEXT: v_pk_max_f16 v28, v28, v28
3447 ; GFX9-NEXT: v_pk_max_f16 v29, v29, v29
3448 ; GFX9-NEXT: v_pk_max_f16 v30, v30, v30
3449 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3450 ; GFX9-NEXT: v_pk_max_f16 v31, v31, v31
3451 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3453 ; CI-LABEL: v_test_canonicalize_var_v64f16:
3455 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3456 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3457 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3458 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3459 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3460 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
3461 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4
3462 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v5
3463 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
3464 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v9
3465 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3466 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
3467 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
3468 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v10
3469 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v13
3470 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v18
3471 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3472 ; CI-NEXT: v_or_b32_e32 v3, v4, v3
3473 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8
3474 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v14
3475 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v21
3476 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v26
3477 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3478 ; CI-NEXT: v_or_b32_e32 v4, v5, v4
3479 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
3480 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12
3481 ; CI-NEXT: v_or_b32_e32 v5, v7, v5
3482 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
3483 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v17
3484 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
3485 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v22
3486 ; CI-NEXT: v_or_b32_e32 v6, v7, v6
3487 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
3488 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v16
3489 ; CI-NEXT: v_or_b32_e32 v7, v9, v7
3490 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v15
3491 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v25
3492 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
3493 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
3494 ; CI-NEXT: v_or_b32_e32 v8, v9, v8
3495 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
3496 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v20
3497 ; CI-NEXT: v_or_b32_e32 v9, v11, v9
3498 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v19
3499 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
3500 ; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32
3501 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
3502 ; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
3503 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
3504 ; CI-NEXT: v_or_b32_e32 v10, v11, v10
3505 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
3506 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v24
3507 ; CI-NEXT: v_or_b32_e32 v11, v13, v11
3508 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v23
3509 ; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
3510 ; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16
3511 ; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
3512 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
3513 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
3514 ; CI-NEXT: v_cvt_f16_f32_e32 v24, v30
3515 ; CI-NEXT: v_or_b32_e32 v12, v13, v12
3516 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
3517 ; CI-NEXT: v_or_b32_e32 v13, v15, v13
3518 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
3519 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v27
3520 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36
3521 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
3522 ; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
3523 ; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
3524 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
3525 ; CI-NEXT: v_or_b32_e32 v14, v15, v14
3526 ; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24
3527 ; CI-NEXT: v_or_b32_e32 v15, v25, v15
3528 ; CI-NEXT: s_waitcnt vmcnt(11)
3529 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
3530 ; CI-NEXT: s_waitcnt vmcnt(10)
3531 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
3532 ; CI-NEXT: s_waitcnt vmcnt(9)
3533 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
3534 ; CI-NEXT: s_waitcnt vmcnt(8)
3535 ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
3536 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
3537 ; CI-NEXT: v_or_b32_e32 v16, v17, v16
3538 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
3539 ; CI-NEXT: v_or_b32_e32 v17, v19, v17
3540 ; CI-NEXT: s_waitcnt vmcnt(7)
3541 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
3542 ; CI-NEXT: s_waitcnt vmcnt(6)
3543 ; CI-NEXT: v_cvt_f16_f32_e32 v19, v21
3544 ; CI-NEXT: s_waitcnt vmcnt(5)
3545 ; CI-NEXT: v_cvt_f16_f32_e32 v20, v22
3546 ; CI-NEXT: s_waitcnt vmcnt(4)
3547 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v23
3548 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
3549 ; CI-NEXT: v_or_b32_e32 v18, v19, v18
3550 ; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
3551 ; CI-NEXT: v_or_b32_e32 v19, v21, v19
3552 ; CI-NEXT: s_waitcnt vmcnt(3)
3553 ; CI-NEXT: v_cvt_f16_f32_e32 v20, v26
3554 ; CI-NEXT: s_waitcnt vmcnt(2)
3555 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v27
3556 ; CI-NEXT: s_waitcnt vmcnt(1)
3557 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
3558 ; CI-NEXT: s_waitcnt vmcnt(0)
3559 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
3560 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
3561 ; CI-NEXT: v_or_b32_e32 v20, v21, v20
3562 ; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
3563 ; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
3564 ; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48
3565 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
3566 ; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
3567 ; CI-NEXT: v_or_b32_e32 v21, v27, v21
3568 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
3569 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128
3570 ; CI-NEXT: s_waitcnt vmcnt(5)
3571 ; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
3572 ; CI-NEXT: s_waitcnt vmcnt(4)
3573 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
3574 ; CI-NEXT: s_waitcnt vmcnt(3)
3575 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
3576 ; CI-NEXT: s_waitcnt vmcnt(2)
3577 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
3578 ; CI-NEXT: s_waitcnt vmcnt(1)
3579 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3580 ; CI-NEXT: s_waitcnt vmcnt(0)
3581 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
3582 ; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
3583 ; CI-NEXT: v_or_b32_e32 v24, v25, v24
3584 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
3585 ; CI-NEXT: v_or_b32_e32 v26, v27, v26
3586 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0
3587 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
3588 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124
3589 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
3590 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
3591 ; CI-NEXT: v_or_b32_e32 v22, v22, v23
3592 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88
3593 ; CI-NEXT: s_waitcnt vmcnt(2)
3594 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3595 ; CI-NEXT: s_waitcnt vmcnt(1)
3596 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
3597 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
3598 ; CI-NEXT: v_or_b32_e32 v26, v27, v26
3599 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
3600 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
3601 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116
3602 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
3603 ; CI-NEXT: s_waitcnt vmcnt(3)
3604 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
3605 ; CI-NEXT: s_waitcnt vmcnt(1)
3606 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3607 ; CI-NEXT: s_waitcnt vmcnt(0)
3608 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
3609 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
3610 ; CI-NEXT: v_or_b32_e32 v26, v27, v26
3611 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0
3612 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
3613 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
3614 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104
3615 ; CI-NEXT: s_waitcnt vmcnt(1)
3616 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v26
3617 ; CI-NEXT: s_waitcnt vmcnt(0)
3618 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
3619 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
3620 ; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
3621 ; CI-NEXT: v_or_b32_e32 v25, v26, v25
3622 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0
3623 ; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
3624 ; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100
3625 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
3626 ; CI-NEXT: s_waitcnt vmcnt(3)
3627 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
3628 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
3629 ; CI-NEXT: v_or_b32_e32 v23, v23, v27
3630 ; CI-NEXT: s_waitcnt vmcnt(1)
3631 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
3632 ; CI-NEXT: s_waitcnt vmcnt(0)
3633 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3634 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
3635 ; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
3636 ; CI-NEXT: v_or_b32_e32 v25, v26, v25
3637 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0
3638 ; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
3639 ; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
3640 ; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
3641 ; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
3642 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
3643 ; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72
3644 ; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84
3645 ; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80
3646 ; CI-NEXT: s_waitcnt vmcnt(3)
3647 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
3648 ; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
3649 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
3650 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
3651 ; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
3652 ; CI-NEXT: v_or_b32_e32 v25, v26, v25
3653 ; CI-NEXT: s_waitcnt vmcnt(2)
3654 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
3655 ; CI-NEXT: s_waitcnt vmcnt(0)
3656 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
3657 ; CI-NEXT: v_or_b32_e32 v23, v26, v23
3658 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
3659 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
3660 ; CI-NEXT: v_or_b32_e32 v26, v27, v26
3661 ; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0
3662 ; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
3663 ; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0
3664 ; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen
3665 ; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0
3666 ; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
3667 ; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0
3668 ; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen
3669 ; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0
3670 ; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen
3671 ; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0
3672 ; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
3673 ; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0
3674 ; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
3675 ; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0
3676 ; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
3677 ; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0
3678 ; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
3679 ; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0
3680 ; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
3681 ; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
3682 ; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
3683 ; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
3684 ; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
3685 ; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0
3686 ; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
3687 ; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0
3688 ; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
3689 ; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0
3690 ; CI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
3691 ; CI-NEXT: v_add_i32_e32 v12, vcc, 40, v0
3692 ; CI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
3693 ; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0
3694 ; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
3695 ; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0
3696 ; CI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
3697 ; CI-NEXT: v_add_i32_e32 v9, vcc, 28, v0
3698 ; CI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen
3699 ; CI-NEXT: v_add_i32_e32 v8, vcc, 24, v0
3700 ; CI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
3701 ; CI-NEXT: v_add_i32_e32 v7, vcc, 20, v0
3702 ; CI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
3703 ; CI-NEXT: v_add_i32_e32 v6, vcc, 16, v0
3704 ; CI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
3705 ; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
3706 ; CI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
3707 ; CI-NEXT: v_add_i32_e32 v4, vcc, 8, v0
3708 ; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
3709 ; CI-NEXT: v_add_i32_e32 v3, vcc, 4, v0
3710 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
3711 ; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3712 ; CI-NEXT: s_waitcnt vmcnt(0)
3713 ; CI-NEXT: s_setpc_b64 s[30:31]
3715 ; GFX11-LABEL: v_test_canonicalize_var_v64f16:
3717 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
3719 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
3720 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
3721 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
3722 ; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
3723 ; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
3724 ; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
3725 ; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
3726 ; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
3727 ; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
3728 ; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
3729 ; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
3730 ; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
3731 ; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
3732 ; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
3733 ; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
3734 ; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
3735 ; GFX11-NEXT: v_pk_max_f16 v16, v16, v16
3736 ; GFX11-NEXT: v_pk_max_f16 v17, v17, v17
3737 ; GFX11-NEXT: v_pk_max_f16 v18, v18, v18
3738 ; GFX11-NEXT: v_pk_max_f16 v19, v19, v19
3739 ; GFX11-NEXT: v_pk_max_f16 v20, v20, v20
3740 ; GFX11-NEXT: v_pk_max_f16 v21, v21, v21
3741 ; GFX11-NEXT: v_pk_max_f16 v22, v22, v22
3742 ; GFX11-NEXT: v_pk_max_f16 v23, v23, v23
3743 ; GFX11-NEXT: v_pk_max_f16 v24, v24, v24
3744 ; GFX11-NEXT: v_pk_max_f16 v25, v25, v25
3745 ; GFX11-NEXT: v_pk_max_f16 v26, v26, v26
3746 ; GFX11-NEXT: v_pk_max_f16 v27, v27, v27
3747 ; GFX11-NEXT: v_pk_max_f16 v28, v28, v28
3748 ; GFX11-NEXT: v_pk_max_f16 v29, v29, v29
3749 ; GFX11-NEXT: v_pk_max_f16 v30, v30, v30
3750 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3751 ; GFX11-NEXT: v_pk_max_f16 v31, v31, v31
3752 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3753 %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val)
3754 ret <64 x half> %canonicalized
; Attribute groups referenced via `#N` by the declarations and definitions in this test.
; #0: nounwind + readnone; attached to the intrinsic declarations at the top of the file.
3757 attributes #0 = { nounwind readnone }
; #1: nounwind, with "denormal-fp-math-f32" set to "preserve-sign,preserve-sign".
3758 attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #2: nounwind, with the unsuffixed "denormal-fp-math" set to "preserve-sign,preserve-sign".
3759 attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
; #3: same text as #1. NOTE(review): confirm whether #3 was meant to differ from #1;
; do not merge or change either without regenerating the check lines.
3760 attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }