1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
3 ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; Intrinsics exercised by this file: fabs and canonicalize on half scalars and
; half vectors, plus the AMDGPU work-item id used to index per-lane loads.
; All of them carry attribute group #0.
5 declare half @llvm.fabs.f16(half) #0
6 declare half @llvm.canonicalize.f16(half) #0
7 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
8 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
9 declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
10 declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
11 declare i32 @llvm.amdgcn.workitem.id.x() #0
13 ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
14 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
15 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes an undef half and stores the result through %out.
; The checks above expect the constant 0x7e00 to be materialized and stored.
16 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
17 %canonicalized = call half @llvm.canonicalize.f16(half undef)
18 store half %canonicalized, half addrspace(1)* %out
22 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
23 ; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
24 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
26 ; CI: v_cvt_f32_f16_e32
27 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; Loads a half from %out and canonicalizes the loaded (non-constant) value.
; NOTE(review): the result is stored to an undef address rather than back to
; %out, unlike the neighbouring tests - confirm this is intentional.
28 define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
29 %val = load half, half addrspace(1)* %out
30 %canonicalized = call half @llvm.canonicalize.f16(half %val)
31 store half %canonicalized, half addrspace(1)* undef
35 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
36 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
37 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; The input arrives as a zero-extended i16 kernel argument, is bitcast to half,
; canonicalized, and stored through %out.
38 define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
39 %val = bitcast i16 %val.arg to half
40 %canonicalized = call half @llvm.canonicalize.f16(half %val)
41 store half %canonicalized, half addrspace(1)* %out
45 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
46 ; GFX9: v_and_b32_e32 v0, 0xffff, v0
47 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
48 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
50 ; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
51 ; VI: v_max_f16_e32 v0, v0, v0
52 ; VI: v_or_b32_e32 v0, v0, v1
; Builds a <2 x half> from two scalar arguments (%lo in element 0, %hi in
; element 1), canonicalizes the whole vector and returns it.
53 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
54 %ins0 = insertelement <2 x half> undef, half %lo, i32 0
55 %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
56 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
57 ret <2 x half> %canonicalized
60 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
61 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
62 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Loads a half from %out, applies llvm.fabs, canonicalizes the result and
; stores it back through %out.
63 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
64 %val = load half, half addrspace(1)* %out
65 %val.fabs = call half @llvm.fabs.f16(half %val)
66 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
67 store half %canonicalized, half addrspace(1)* %out
71 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
72 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
73 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
75 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
76 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; Loads a half from %out, takes its absolute value, negates that (written as
; fsub from -0.0), canonicalizes the result and stores it back through %out.
77 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
78 %val = load half, half addrspace(1)* %out
79 %val.fabs = call half @llvm.fabs.f16(half %val)
80 %val.fabs.fneg = fsub half -0.0, %val.fabs
81 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
82 store half %canonicalized, half addrspace(1)* %out
86 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
87 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
88 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
90 ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}}
91 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; Loads a half from %out, negates it (written as fsub from -0.0),
; canonicalizes the result and stores it back through %out.
92 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
93 %val = load half, half addrspace(1)* %out
94 %val.fneg = fsub half -0.0, %val
95 %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
96 store half %canonicalized, half addrspace(1)* %out
100 ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16:
101 ; GFX89: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}}
102 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Negate-then-canonicalize of a loaded half, compiled with attribute group #2
; (the "-fp64-fp16-denormals" target feature). The checks above expect a
; multiply by -1.0 in this configuration.
103 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 {
104 %val = load half, half addrspace(1)* %out
105 %val.fneg = fsub half -0.0, %val
106 %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
107 store half %canonicalized, half addrspace(1)* %out
111 ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
112 ; GFX89: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}|
113 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
115 ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|
116 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; fabs, then negate, then canonicalize of a loaded half, compiled with
; attribute group #2 (the "-fp64-fp16-denormals" target feature).
117 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #2 {
118 %val = load half, half addrspace(1)* %out
119 %val.fabs = call half @llvm.fabs.f16(half %val)
120 %val.fabs.fneg = fsub half -0.0, %val.fabs
121 %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
122 store half %canonicalized, half addrspace(1)* %out
126 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
127 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
128 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant +0.0 and stores the result through %out.
129 define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
130 %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
131 store half %canonicalized, half addrspace(1)* %out
135 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
136 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
137 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant -0.0 and stores the result through %out; the
; checks above expect the sign bit to survive (low 16 bits 0x8000).
138 define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
139 %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
140 store half %canonicalized, half addrspace(1)* %out
144 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
145 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
146 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 1.0 and stores the result through %out.
147 define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
148 %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
149 store half %canonicalized, half addrspace(1)* %out
153 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
154 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
155 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant -1.0 and stores the result through %out.
156 define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
157 %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
158 store half %canonicalized, half addrspace(1)* %out
162 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
163 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
164 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 16.0 and stores the result through %out; the
; checks above expect it as the literal 0x4c00.
165 define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
166 %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
167 store half %canonicalized, half addrspace(1)* %out
171 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
172 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
173 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xH03FF (zero exponent field, all mantissa bits
; set: a positive denormal half) under attribute group #1, which sets no
; denormal-related target feature. The checks above expect 0x3ff unchanged.
174 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
175 %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
176 store half %canonicalized, half addrspace(1)* %out
180 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
181 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
182 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the positive denormal constant 0xH03FF under attribute
; group #3 (the "+fp64-fp16-denormals" target feature).
183 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
184 %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
185 store half %canonicalized, half addrspace(1)* %out
189 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
190 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
191 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xH83FF (sign bit set, zero exponent field, all
; mantissa bits set: a negative denormal half) under attribute group #1.
192 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
193 %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
194 store half %canonicalized, half addrspace(1)* %out
198 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
199 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
200 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the negative denormal constant 0xH83FF under attribute
; group #3 (the "+fp64-fp16-denormals" target feature).
201 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
202 %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
203 store half %canonicalized, half addrspace(1)* %out
207 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
208 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
209 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xH7C00 and stores the result through %out.
; NOTE(review): 0xH7C00 has an all-ones exponent and a zero mantissa, which is
; the +infinity encoding rather than a quiet NaN, despite the "qnan" in the
; test name; the checks above expect it to be stored unchanged as 0x7c00.
210 define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
211 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
212 store half %canonicalized, half addrspace(1)* %out
216 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
217 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
218 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the half whose bit pattern is i16 -1 (all 16 bits set: a NaN
; with the sign bit and every mantissa bit set). The checks above expect the
; result to be 0x7e00.
219 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
220 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
221 store half %canonicalized, half addrspace(1)* %out
225 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
226 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
227 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the half whose bit pattern is i16 -2 (0xFFFE: a NaN with the
; sign bit and the top mantissa bit set). The checks above expect 0x7e00.
228 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
229 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
230 store half %canonicalized, half addrspace(1)* %out
234 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
235 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
236 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xH7C01: all-ones exponent, top mantissa bit
; clear, lowest mantissa bit set (a positive signaling NaN with the smallest
; payload). The checks above expect the result to be 0x7e00.
237 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
238 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
239 store half %canonicalized, half addrspace(1)* %out
243 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
244 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
245 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xH7DFF: all-ones exponent, top mantissa bit
; clear, all lower mantissa bits set (a positive signaling NaN with the
; largest payload). The checks above expect the result to be 0x7e00.
246 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
247 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
248 store half %canonicalized, half addrspace(1)* %out
252 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
253 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
254 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xHFDFF: the same signaling-NaN pattern as
; 0xH7DFF but with the sign bit set. The checks above expect 0x7e00.
255 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
256 %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
257 store half %canonicalized, half addrspace(1)* %out
261 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
262 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
263 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant 0xHFC01: the same signaling-NaN pattern as
; 0xH7C01 but with the sign bit set. The checks above expect 0x7e00.
264 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
265 %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
266 store half %canonicalized, half addrspace(1)* %out
270 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
271 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
272 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
275 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}}
276 ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Each work item loads the <2 x half> at %out[workitem.id.x], canonicalizes
; it, and stores the result to the base pointer %out (not to the indexed
; address it loaded from).
277 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
278 %tid = call i32 @llvm.amdgcn.workitem.id.x()
279 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
280 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
281 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
282 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
286 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
287 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
288 ; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
292 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
293 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]]{{$}}
294 ; GFX89: {{flat|global}}_store_dword
; Loads the <2 x half> at %out[workitem.id.x], applies llvm.fabs to the
; vector, canonicalizes it, and stores the result to the base pointer %out.
295 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
296 %tid = call i32 @llvm.amdgcn.workitem.id.x()
297 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
298 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
299 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
300 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
301 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
305 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
306 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
307 ; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
310 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
311 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1]{{$}}
312 ; GFX89: {{flat|global}}_store_dword
316 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
317 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
; Loads the <2 x half> at %out[workitem.id.x], takes fabs, negates both
; elements (written as fsub from a <-0.0, -0.0> vector), canonicalizes, and
; stores the result to the base pointer %out.
318 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
319 %tid = call i32 @llvm.amdgcn.workitem.id.x()
320 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
321 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
322 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
323 %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
324 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
325 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
329 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
330 ; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
331 ; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
334 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
335 ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Loads the <2 x half> at %out[workitem.id.x], negates both elements (written
; as fsub from a <-0.0, -0.0> vector), canonicalizes, and stores the result to
; the base pointer %out.
336 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
337 %tid = call i32 @llvm.amdgcn.workitem.id.x()
338 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
339 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
340 %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
341 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
342 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
346 ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
347 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
348 ; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
351 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}}
352 ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; The input arrives as an i32 kernel argument, is bitcast to <2 x half>,
; canonicalized, and stored through %out.
353 define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
354 %val = bitcast i32 %val.arg to <2 x half>
355 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
356 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
360 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
361 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
362 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes an all-zero <2 x half> constant and stores it through %out.
363 define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
364 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
365 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
369 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
370 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
371 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant vector <-0.0, -0.0> and stores it through %out;
; the checks above expect both sign bits to survive (0x80008000).
372 define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
373 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
374 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
378 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
379 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
380 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant vector <1.0, 1.0> and stores it through %out.
381 define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
382 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
383 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
387 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
388 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
389 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant vector <-1.0, -1.0> and stores it through %out.
390 define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
391 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
392 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
396 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
397 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
398 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant vector <16.0, 16.0> and stores it through %out;
; the checks above expect the packed literal 0x4c004c00.
399 define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
400 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
401 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
405 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
406 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
407 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two positive denormal halves (0xH03FF) and stores
; it through %out; the checks above expect the values unchanged (0x3ff03ff).
; NOTE(review): despite "no_denormals" in the name, this function uses
; attribute group #1 rather than a group that disables f16 denormals -
; confirm whether the name or the attribute group is the intended one.
408 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
409 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
410 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
414 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
415 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
416 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two positive denormal halves (0xH03FF) under
; attribute group #3 (the "+fp64-fp16-denormals" target feature).
417 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
418 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
419 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
423 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
424 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
425 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two negative denormal halves (0xH83FF) and stores
; it through %out; the checks above expect the values unchanged (0x83ff83ff).
; NOTE(review): despite "no_denormals" in the name, this function uses
; attribute group #1 rather than a group that disables f16 denormals -
; confirm whether the name or the attribute group is the intended one.
426 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
427 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
428 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
432 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
433 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
434 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two negative denormal halves (0xH83FF) under
; attribute group #3 (the "+fp64-fp16-denormals" target feature).
435 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
436 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
437 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
441 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
442 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
443 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the constant vector <0xH7C00, 0xH7C00> and stores it via %out.
; NOTE(review): 0xH7C00 has an all-ones exponent and a zero mantissa, which is
; the +infinity encoding rather than a quiet NaN, despite the "qnan" in the
; test name; the checks above expect it unchanged as 0x7c007c00.
444 define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
445 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
446 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
450 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
451 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
452 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes the <2 x half> whose bit pattern is i32 -1 (every bit set in
; both elements, i.e. two NaNs). The checks above expect 0x7e007e00.
453 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
454 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
455 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
459 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
460 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
461 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector whose two elements are each the half with bit
; pattern i16 -2 (0xFFFE, a NaN). The checks above expect 0x7e007e00.
462 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
463 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
464 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
468 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
469 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
470 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two 0xH7C01 halves (positive signaling NaN with
; the smallest payload). The checks above expect 0x7e007e00.
471 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
472 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
473 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
477 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
478 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
479 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two 0xH7DFF halves (positive signaling NaN with
; the largest payload). The checks above expect 0x7e007e00.
480 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
481 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
482 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
486 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
487 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
488 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two 0xHFDFF halves (signaling NaN with the sign
; bit set and the largest payload). The checks above expect 0x7e007e00.
489 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
490 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
491 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
495 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
496 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
497 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a vector of two 0xHFC01 halves (signaling NaN with the sign
; bit set and the smallest payload). The checks above expect 0x7e007e00.
498 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
499 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
500 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
504 ; FIXME: The unused 4th component is also canonicalized (extra work for v3f16).
505 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
507 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
508 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
509 ; GFX9-NEXT: s_setpc_b64
511 ; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
512 ; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
513 ; VI-DAG: v_max_f16_e32 v1, v1, v1
514 ; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
; Non-kernel function: canonicalizes its <3 x half> argument and returns it.
517 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
518 %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
519 ret <3 x half> %canonicalized
522 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v4f16:
524 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
525 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
526 ; GFX9-NEXT: s_setpc_b64
528 ; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
529 ; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
530 ; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
531 ; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
532 ; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
533 ; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
; Non-kernel function: canonicalizes its <4 x half> argument and returns it.
535 define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
536 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
537 ret <4 x half> %canonicalized
540 ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
541 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
542 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalizes a fully undef <2 x half> and stores the result through %out;
; the checks above expect the packed constant 0x7e007e00.
543 define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
544 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
545 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
549 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
551 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
552 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
553 ; GFX9-NEXT: s_setpc_b64
555 ; On VI the high 16 bits of the result are known to be zero, so no mask is emitted.
556 ; FIXME: Should this also be true on gfx9 by default?
558 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
559 ; VI-NEXT: s_setpc_b64
; Places %val in element 0 of an otherwise undef <2 x half> (element 1 stays
; undef), canonicalizes the vector and returns it.
560 define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
561 %vec = insertelement <2 x half> undef, half %val, i32 0
562 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
563 ret <2 x half> %canonicalized
566 ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
568 ; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
569 ; GFX89-NEXT: s_setpc_b64
; Places %val in element 1 of an otherwise undef <2 x half> (element 0 stays
; undef), canonicalizes the vector and returns it.
570 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
571 %vec = insertelement <2 x half> undef, half %val, i32 1
572 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
573 ret <2 x half> %canonicalized
576 ; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
578 ; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
579 ; GFX89-NEXT: s_setpc_b64
581 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
582 ; CI-NEXT: v_mov_b32_e32 v1, 1.0
583 ; CI-NEXT: s_setpc_b64
; Element 0 is undef and element 1 is the constant 1.0; the vector is
; canonicalized and returned. The GFX89 checks above expect 1.0 in both
; halves (0x3c003c00).
584 define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
585 %vec = insertelement <2 x half> undef, half 1.0, i32 1
586 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
587 ret <2 x half> %canonicalized
590 ; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
592 ; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
593 ; GFX89-NEXT: s_setpc_b64
595 ; CI-NEXT: v_mov_b32_e32 v0, 1.0
596 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
597 ; CI-NEXT: s_setpc_b64
; Element 0 is the constant 1.0 and element 1 is undef; the vector is
; canonicalized and returned. The GFX89 checks above expect 1.0 in both
; halves (0x3c003c00).
598 define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
599 %vec = insertelement <2 x half> undef, half 1.0, i32 0
600 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
601 ret <2 x half> %canonicalized
604 ; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
606 ; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
607 ; GFX89-NEXT: s_setpc_b64
609 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
610 ; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
611 ; CI-NEXT: s_setpc_b64
; Element 0 is undef and element 1 is the constant 16.0; the vector is
; canonicalized and returned. The GFX89 checks above expect 16.0 in both
; halves (0x4c004c00).
612 define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
613 %vec = insertelement <2 x half> undef, half 16.0, i32 1
614 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
615 ret <2 x half> %canonicalized
618 ; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
620 ; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
621 ; GFX89-NEXT: s_setpc_b64
623 ; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
624 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
625 ; CI-NEXT: s_setpc_b64
; Element 0 is the constant 16.0 and element 1 is undef; the vector is
; canonicalized and returned. The GFX89 checks above expect 16.0 in both
; halves (0x4c004c00).
626 define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
627 %vec = insertelement <2 x half> undef, half 16.0, i32 0
628 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
629 ret <2 x half> %canonicalized
632 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
634 ; GFX9-DAG: v_max_f16_e32 v0, v0, v0
635 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
636 ; GFX9: v_and_b32_e32 v0, 0xffff, v0
637 ; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
641 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
642 ; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
643 ; VI-NEXT: s_setpc_b64
; Builds a vector with %val in element 0 and the constant 2.0 in element 1,
; canonicalizes it and returns it.
644 define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
645 %vec0 = insertelement <2 x half> undef, half %val, i32 0
646 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
647 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
648 ret <2 x half> %canonicalized
651 ; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
652 ; GFX9: v_max_f16_e32 v0, v0, v0
653 ; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
654 ; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
658 ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
659 ; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
660 ; VI-NEXT: s_setpc_b64
; Builds a vector with the constant 2.0 in element 0 and %val in element 1,
; canonicalizes it and returns it.
661 define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
662 %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
663 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
664 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
665 ret <2 x half> %canonicalized
668 ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
669 ; GCN: v_mov_b32_e32 v0, 0x7e007e00
670 ; GCN: v_mov_b32_e32 v1, v0
; Canonicalizes a fully undef <4 x half> and stores the result through %out;
; the checks above expect 0x7e007e00 in both 32-bit halves of the value.
671 define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
672 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
673 store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out
677 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
679 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
680 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
681 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
682 ; GFX9-NEXT: s_setpc_b64
685 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
686 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
687 ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
688 ; VI-NEXT: s_setpc_b64
; Places %val in element 0 of an otherwise undef <4 x half> (elements 1-3 stay
; undef), canonicalizes the vector and returns it.
689 define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
690 %vec = insertelement <4 x half> undef, half %val, i32 0
691 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
692 ret <4 x half> %canonicalized
695 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
697 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
698 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
699 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
700 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
701 ; GFX9-NEXT: s_setpc_b64
704 ; VI-DAG: v_max_f16_e32 v0, v0, v0
705 ; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
706 ; VI: v_or_b32_e32 v0, v0, v1
707 ; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
708 ; VI-NEXT: s_setpc_b64
; Places %val0 in element 0 and %val1 in element 1 of a <4 x half> whose
; elements 2 and 3 stay undef, canonicalizes the vector and returns it.
709 define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
710 %vec0 = insertelement <4 x half> undef, half %val0, i32 0
711 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
712 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
713 ret <4 x half> %canonicalized
716 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
718 ; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
719 ; GFX9-NEXT: v_and_b32_e32 v1, [[MASK]], v1
720 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
721 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
722 ; GFX9-NEXT: v_and_b32_e32 v0, [[MASK]], v0
723 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
724 ; GFX9-NEXT: s_setpc_b64
727 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
728 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
729 ; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
730 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
731 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
732 ; VI-NEXT: s_setpc_b64
; Places %val0 in element 0, %val1 in element 2 and %val2 in element 3 of a
; <4 x half>; element 1 stays undef. The vector is canonicalized and returned.
733 define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
734 %vec0 = insertelement <4 x half> undef, half %val0, i32 0
735 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
736 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
737 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
738 ret <4 x half> %canonicalized
; Attribute groups referenced above.
;   #0 - used on the intrinsic declarations.
;   #1 - used on tests that set no denormal-related target feature.
;   #2 - removes the fp64-fp16-denormals target feature.
;   #3 - adds the fp64-fp16-denormals target feature.
741 attributes #0 = { nounwind readnone }
742 attributes #1 = { nounwind }
743 attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" }
744 attributes #3 = { nounwind "target-features"="+fp64-fp16-denormals" }