1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
; Scalar operands: both inputs to llvm.amdgcn.cvt.pkrtz are float kernel
; arguments (%x, %y) and the packed <2 x half> result is stored to %out.
; In the check lines below, the SI, VI and GFX9 sequences first copy s3 into a
; VGPR before the convert, while the GFX10 and GFX11 sequences pass s2 and s3
; directly; GFX11 spells the mnemonic v_cvt_pk_rtz_f16_f32.
8 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
9 ; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
11 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: s_mov_b32 s6, -1
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_mov_b32 s4, s0
16 ; SI-NEXT: s_mov_b32 s5, s1
17 ; SI-NEXT: v_mov_b32_e32 v0, s3
18 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
19 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
22 ; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
25 ; VI-NEXT: s_waitcnt lgkmcnt(0)
26 ; VI-NEXT: v_mov_b32_e32 v0, s3
27 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0
28 ; VI-NEXT: v_mov_b32_e32 v0, s0
29 ; VI-NEXT: v_mov_b32_e32 v1, s1
30 ; VI-NEXT: flat_store_dword v[0:1], v2
33 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
35 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
36 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
37 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
38 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
39 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1
40 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
43 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
45 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
46 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
47 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
49 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
50 ; GFX10-NEXT: s_endpgm
52 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
54 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
55 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
56 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
57 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
58 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
60 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
61 ; GFX11-NEXT: s_endpgm
; IR under test: convert the two scalar arguments and store the packed result.
62 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
63 store <2 x half> %result, ptr addrspace(1) %out
; Scalar operand used twice: the single float kernel argument %x is passed as
; both inputs of llvm.amdgcn.cvt.pkrtz. Every check sequence below expects the
; same SGPR to appear as both source operands of the convert instruction, with
; no copy into a VGPR.
67 define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 {
68 ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
70 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
71 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
72 ; SI-NEXT: s_mov_b32 s3, 0xf000
73 ; SI-NEXT: s_mov_b32 s2, -1
74 ; SI-NEXT: s_waitcnt lgkmcnt(0)
75 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4
76 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
79 ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
81 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
83 ; VI-NEXT: s_waitcnt lgkmcnt(0)
84 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2
85 ; VI-NEXT: v_mov_b32_e32 v0, s0
86 ; VI-NEXT: v_mov_b32_e32 v1, s1
87 ; VI-NEXT: flat_store_dword v[0:1], v2
90 ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
92 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
93 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
94 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
95 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4
97 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
100 ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
102 ; GFX10-NEXT: s_clause 0x1
103 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
104 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
105 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
106 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4
108 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
109 ; GFX10-NEXT: s_endpgm
111 ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
113 ; GFX11-NEXT: s_clause 0x1
114 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
115 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
116 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
117 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
119 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
120 ; GFX11-NEXT: s_nop 0
121 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
122 ; GFX11-NEXT: s_endpgm
; IR under test: %x is deliberately used for both operands.
123 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
124 store <2 x half> %result, ptr addrspace(1) %out
; Both operands undef: llvm.amdgcn.cvt.pkrtz is called with two undef inputs.
; The check lines visible here list no convert or store instruction for this
; kernel; after the label they only expect s_endpgm for GFX10 and GFX11.
128 define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 {
129 ; GCN-LABEL: s_cvt_pkrtz_undef_undef:
133 ; GFX10-LABEL: s_cvt_pkrtz_undef_undef:
135 ; GFX10-NEXT: s_endpgm
137 ; GFX11-LABEL: s_cvt_pkrtz_undef_undef:
139 ; GFX11-NEXT: s_endpgm
; IR under test: the result of converting two undef values is stored to %out.
140 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
141 store <2 x half> %result, ptr addrspace(1) %out
; Vector operands: both inputs to llvm.amdgcn.cvt.pkrtz come from volatile
; loads of %a.ptr and %b.ptr, each indexed by the workitem id, and the packed
; result is stored to %out at the same index. Every check sequence below
; expects both source operands of the convert instruction to be VGPRs.
145 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
146 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
148 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
149 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
150 ; SI-NEXT: s_mov_b32 s11, 0xf000
151 ; SI-NEXT: s_mov_b32 s10, 0
152 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
153 ; SI-NEXT: v_mov_b32_e32 v1, 0
154 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
155 ; SI-NEXT: s_waitcnt lgkmcnt(0)
156 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
157 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
158 ; SI-NEXT: s_waitcnt vmcnt(0)
159 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
160 ; SI-NEXT: s_waitcnt vmcnt(0)
161 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
162 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
163 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
166 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
168 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
169 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
170 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
171 ; VI-NEXT: s_waitcnt lgkmcnt(0)
172 ; VI-NEXT: v_mov_b32_e32 v1, s7
173 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
174 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
175 ; VI-NEXT: v_mov_b32_e32 v3, s1
176 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
177 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
178 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
179 ; VI-NEXT: s_waitcnt vmcnt(0)
180 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
181 ; VI-NEXT: s_waitcnt vmcnt(0)
182 ; VI-NEXT: v_mov_b32_e32 v1, s5
183 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
184 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
185 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
186 ; VI-NEXT: flat_store_dword v[0:1], v2
189 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
191 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
192 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
193 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
194 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
195 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
196 ; GFX9-NEXT: s_waitcnt vmcnt(0)
197 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
198 ; GFX9-NEXT: s_waitcnt vmcnt(0)
199 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
200 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
201 ; GFX9-NEXT: s_endpgm
203 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32:
205 ; GFX10-NEXT: s_clause 0x1
206 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
207 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
208 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
209 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
211 ; GFX10-NEXT: s_waitcnt vmcnt(0)
212 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
213 ; GFX10-NEXT: s_waitcnt vmcnt(0)
214 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2
215 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
216 ; GFX10-NEXT: s_endpgm
218 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32:
220 ; GFX11-NEXT: s_clause 0x1
221 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
222 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
223 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
224 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
225 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
226 ; GFX11-NEXT: s_waitcnt vmcnt(0)
227 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
228 ; GFX11-NEXT: s_waitcnt vmcnt(0)
229 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
230 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
231 ; GFX11-NEXT: s_nop 0
232 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
233 ; GFX11-NEXT: s_endpgm
; Index all three pointers by the sign-extended workitem id.
234 %tid = call i32 @llvm.amdgcn.workitem.id.x()
235 %tid.ext = sext i32 %tid to i64
236 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
237 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
238 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
; Both operands are loaded with volatile loads.
239 %a = load volatile float, ptr addrspace(1) %a.gep
240 %b = load volatile float, ptr addrspace(1) %b.gep
241 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
242 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Register / immediate: the first input to llvm.amdgcn.cvt.pkrtz is a loaded
; value and the second is the constant 1.0. Every check sequence below expects
; 1.0 to appear directly as the second source operand of the convert; the SI,
; GFX10 and GFX11 sequences spell the instruction with the _e64 suffix.
246 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
247 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
249 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
250 ; SI-NEXT: s_mov_b32 s7, 0xf000
251 ; SI-NEXT: s_mov_b32 s6, 0
252 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
253 ; SI-NEXT: v_mov_b32_e32 v1, 0
254 ; SI-NEXT: s_waitcnt lgkmcnt(0)
255 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
256 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
257 ; SI-NEXT: s_waitcnt vmcnt(0)
258 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
259 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
260 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
263 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
265 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
266 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
267 ; VI-NEXT: s_waitcnt lgkmcnt(0)
268 ; VI-NEXT: v_mov_b32_e32 v1, s3
269 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
270 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
271 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
272 ; VI-NEXT: s_waitcnt vmcnt(0)
273 ; VI-NEXT: v_mov_b32_e32 v1, s1
274 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
275 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
276 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
277 ; VI-NEXT: flat_store_dword v[0:1], v2
280 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
282 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
283 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
286 ; GFX9-NEXT: s_waitcnt vmcnt(0)
287 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
288 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
289 ; GFX9-NEXT: s_endpgm
291 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
293 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
294 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
295 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
296 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
297 ; GFX10-NEXT: s_waitcnt vmcnt(0)
298 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
299 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
300 ; GFX10-NEXT: s_endpgm
302 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
304 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
305 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
306 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
307 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
308 ; GFX11-NEXT: s_waitcnt vmcnt(0)
309 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
310 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
311 ; GFX11-NEXT: s_nop 0
312 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
313 ; GFX11-NEXT: s_endpgm
; Index the input and output pointers by the sign-extended workitem id.
314 %tid = call i32 @llvm.amdgcn.workitem.id.x()
315 %tid.ext = sext i32 %tid to i64
316 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
317 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
318 %a = load volatile float, ptr addrspace(1) %a.gep
; The constant is the second (high-half) operand here.
319 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
320 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Immediate / register: the first input to llvm.amdgcn.cvt.pkrtz is the
; constant 1.0 and the second is a loaded value. Every check sequence below
; expects 1.0 to appear directly as the first source operand of the convert;
; the SI, GFX10 and GFX11 sequences spell the instruction with the _e32 suffix.
324 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
325 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
327 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
328 ; SI-NEXT: s_mov_b32 s7, 0xf000
329 ; SI-NEXT: s_mov_b32 s6, 0
330 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
331 ; SI-NEXT: v_mov_b32_e32 v1, 0
332 ; SI-NEXT: s_waitcnt lgkmcnt(0)
333 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
334 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
335 ; SI-NEXT: s_waitcnt vmcnt(0)
336 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
337 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
338 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
341 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
343 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
344 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
345 ; VI-NEXT: s_waitcnt lgkmcnt(0)
346 ; VI-NEXT: v_mov_b32_e32 v1, s3
347 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
348 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
349 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
350 ; VI-NEXT: s_waitcnt vmcnt(0)
351 ; VI-NEXT: v_mov_b32_e32 v1, s1
352 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
353 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
354 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
355 ; VI-NEXT: flat_store_dword v[0:1], v2
358 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
360 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
361 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
362 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
365 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
366 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
367 ; GFX9-NEXT: s_endpgm
369 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
371 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
372 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
375 ; GFX10-NEXT: s_waitcnt vmcnt(0)
376 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
377 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
378 ; GFX10-NEXT: s_endpgm
380 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
382 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
383 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
384 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
386 ; GFX11-NEXT: s_waitcnt vmcnt(0)
387 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
388 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
389 ; GFX11-NEXT: s_nop 0
390 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
391 ; GFX11-NEXT: s_endpgm
; Index the input and output pointers by the sign-extended workitem id.
392 %tid = call i32 @llvm.amdgcn.workitem.id.x()
393 %tid.ext = sext i32 %tid to i64
394 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
395 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
396 %a = load volatile float, ptr addrspace(1) %a.gep
; The constant is the first (low-half) operand here.
397 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
398 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Negated first operand: %a is negated (as a subtraction from -0.0) before being
; passed as the first input of llvm.amdgcn.cvt.pkrtz; %b is passed unchanged.
; Every check sequence below expects the negation to show up as a `-` prefix on
; the first source operand of the convert instruction.
402 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
403 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
405 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
406 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
407 ; SI-NEXT: s_mov_b32 s11, 0xf000
408 ; SI-NEXT: s_mov_b32 s10, 0
409 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
410 ; SI-NEXT: v_mov_b32_e32 v1, 0
411 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
412 ; SI-NEXT: s_waitcnt lgkmcnt(0)
413 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
414 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
415 ; SI-NEXT: s_waitcnt vmcnt(0)
416 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
417 ; SI-NEXT: s_waitcnt vmcnt(0)
418 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
419 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
420 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
423 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
425 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
426 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
427 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
428 ; VI-NEXT: s_waitcnt lgkmcnt(0)
429 ; VI-NEXT: v_mov_b32_e32 v1, s7
430 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
431 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
432 ; VI-NEXT: v_mov_b32_e32 v3, s1
433 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
434 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
435 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
436 ; VI-NEXT: s_waitcnt vmcnt(0)
437 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
438 ; VI-NEXT: s_waitcnt vmcnt(0)
439 ; VI-NEXT: v_mov_b32_e32 v1, s5
440 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
441 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
442 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
443 ; VI-NEXT: flat_store_dword v[0:1], v2
446 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
448 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
449 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
450 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
451 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
453 ; GFX9-NEXT: s_waitcnt vmcnt(0)
454 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
456 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
457 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
458 ; GFX9-NEXT: s_endpgm
460 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
462 ; GFX10-NEXT: s_clause 0x1
463 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
464 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
465 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
466 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
468 ; GFX10-NEXT: s_waitcnt vmcnt(0)
469 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
470 ; GFX10-NEXT: s_waitcnt vmcnt(0)
471 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2
472 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
473 ; GFX10-NEXT: s_endpgm
475 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
477 ; GFX11-NEXT: s_clause 0x1
478 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
479 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
480 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
481 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
483 ; GFX11-NEXT: s_waitcnt vmcnt(0)
484 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
485 ; GFX11-NEXT: s_waitcnt vmcnt(0)
486 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
487 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
488 ; GFX11-NEXT: s_nop 0
489 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
490 ; GFX11-NEXT: s_endpgm
; Index all three pointers by the sign-extended workitem id.
491 %tid = call i32 @llvm.amdgcn.workitem.id.x()
492 %tid.ext = sext i32 %tid to i64
493 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
494 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
495 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
496 %a = load volatile float, ptr addrspace(1) %a.gep
497 %b = load volatile float, ptr addrspace(1) %b.gep
; Negation of %a written as a subtraction from -0.0.
498 %neg.a = fsub float -0.0, %a
499 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
500 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Negated second operand: %b is negated (as a subtraction from -0.0) before
; being passed as the second input of llvm.amdgcn.cvt.pkrtz; %a is passed
; unchanged. Every check sequence below expects the negation to show up as a
; `-` prefix on the second source operand of the convert instruction.
504 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
505 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
507 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
508 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
509 ; SI-NEXT: s_mov_b32 s11, 0xf000
510 ; SI-NEXT: s_mov_b32 s10, 0
511 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
512 ; SI-NEXT: v_mov_b32_e32 v1, 0
513 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
514 ; SI-NEXT: s_waitcnt lgkmcnt(0)
515 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
516 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
517 ; SI-NEXT: s_waitcnt vmcnt(0)
518 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
519 ; SI-NEXT: s_waitcnt vmcnt(0)
520 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
521 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
522 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
525 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
527 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
528 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
529 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
530 ; VI-NEXT: s_waitcnt lgkmcnt(0)
531 ; VI-NEXT: v_mov_b32_e32 v1, s7
532 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
533 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
534 ; VI-NEXT: v_mov_b32_e32 v3, s1
535 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
536 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
537 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
538 ; VI-NEXT: s_waitcnt vmcnt(0)
539 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
540 ; VI-NEXT: s_waitcnt vmcnt(0)
541 ; VI-NEXT: v_mov_b32_e32 v1, s5
542 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
543 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
544 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
545 ; VI-NEXT: flat_store_dword v[0:1], v2
548 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
550 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
551 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
552 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
553 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
554 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
555 ; GFX9-NEXT: s_waitcnt vmcnt(0)
556 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
557 ; GFX9-NEXT: s_waitcnt vmcnt(0)
558 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
559 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
560 ; GFX9-NEXT: s_endpgm
562 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
564 ; GFX10-NEXT: s_clause 0x1
565 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
566 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
567 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
568 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
570 ; GFX10-NEXT: s_waitcnt vmcnt(0)
571 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
572 ; GFX10-NEXT: s_waitcnt vmcnt(0)
573 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2
574 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
575 ; GFX10-NEXT: s_endpgm
577 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
579 ; GFX11-NEXT: s_clause 0x1
580 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
581 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
582 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
583 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
585 ; GFX11-NEXT: s_waitcnt vmcnt(0)
586 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
587 ; GFX11-NEXT: s_waitcnt vmcnt(0)
588 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
589 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
590 ; GFX11-NEXT: s_nop 0
591 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
592 ; GFX11-NEXT: s_endpgm
; Index all three pointers by the sign-extended workitem id.
593 %tid = call i32 @llvm.amdgcn.workitem.id.x()
594 %tid.ext = sext i32 %tid to i64
595 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
596 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
597 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
598 %a = load volatile float, ptr addrspace(1) %a.gep
599 %b = load volatile float, ptr addrspace(1) %b.gep
; Negation of %b written as a subtraction from -0.0.
600 %neg.b = fsub float -0.0, %b
601 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
602 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Both operands negated: %a and %b are each negated (as a subtraction from
; -0.0) before being passed to llvm.amdgcn.cvt.pkrtz. Every check sequence
; below expects a `-` prefix on both source operands of the convert
; instruction.
606 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
607 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
609 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
610 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
611 ; SI-NEXT: s_mov_b32 s11, 0xf000
612 ; SI-NEXT: s_mov_b32 s10, 0
613 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
614 ; SI-NEXT: v_mov_b32_e32 v1, 0
615 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
616 ; SI-NEXT: s_waitcnt lgkmcnt(0)
617 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
618 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
619 ; SI-NEXT: s_waitcnt vmcnt(0)
620 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
621 ; SI-NEXT: s_waitcnt vmcnt(0)
622 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
623 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
624 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
627 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
629 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
630 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
631 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
632 ; VI-NEXT: s_waitcnt lgkmcnt(0)
633 ; VI-NEXT: v_mov_b32_e32 v1, s7
634 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
635 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
636 ; VI-NEXT: v_mov_b32_e32 v3, s1
637 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
638 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
639 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
640 ; VI-NEXT: s_waitcnt vmcnt(0)
641 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
642 ; VI-NEXT: s_waitcnt vmcnt(0)
643 ; VI-NEXT: v_mov_b32_e32 v1, s5
644 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
645 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
646 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
647 ; VI-NEXT: flat_store_dword v[0:1], v2
650 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
652 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
653 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
654 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
657 ; GFX9-NEXT: s_waitcnt vmcnt(0)
658 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
659 ; GFX9-NEXT: s_waitcnt vmcnt(0)
660 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
661 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
662 ; GFX9-NEXT: s_endpgm
664 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
666 ; GFX10-NEXT: s_clause 0x1
667 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
668 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
669 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
670 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
671 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
672 ; GFX10-NEXT: s_waitcnt vmcnt(0)
673 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
674 ; GFX10-NEXT: s_waitcnt vmcnt(0)
675 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2
676 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
677 ; GFX10-NEXT: s_endpgm
679 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
681 ; GFX11-NEXT: s_clause 0x1
682 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
683 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
684 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
685 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
687 ; GFX11-NEXT: s_waitcnt vmcnt(0)
688 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
689 ; GFX11-NEXT: s_waitcnt vmcnt(0)
690 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
691 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
692 ; GFX11-NEXT: s_nop 0
693 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
694 ; GFX11-NEXT: s_endpgm
; Index all three pointers by the sign-extended workitem id.
695 %tid = call i32 @llvm.amdgcn.workitem.id.x()
696 %tid.ext = sext i32 %tid to i64
697 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
698 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
699 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
700 %a = load volatile float, ptr addrspace(1) %a.gep
701 %b = load volatile float, ptr addrspace(1) %b.gep
; Negations of %a and %b, each written as a subtraction from -0.0.
702 %neg.a = fsub float -0.0, %a
703 %neg.b = fsub float -0.0, %b
704 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
705 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Negated absolute value / negated: the first input of llvm.amdgcn.cvt.pkrtz is
; the negation of llvm.fabs.f32(%a) and the second is the negation of %b. Every
; check sequence below expects the first source operand of the convert to be
; written as `-|v…|` and the second as `-v…`.
709 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
710 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
712 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
713 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
714 ; SI-NEXT: s_mov_b32 s11, 0xf000
715 ; SI-NEXT: s_mov_b32 s10, 0
716 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
717 ; SI-NEXT: v_mov_b32_e32 v1, 0
718 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
719 ; SI-NEXT: s_waitcnt lgkmcnt(0)
720 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
721 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
722 ; SI-NEXT: s_waitcnt vmcnt(0)
723 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
724 ; SI-NEXT: s_waitcnt vmcnt(0)
725 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
726 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
727 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
730 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
732 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
733 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
734 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
735 ; VI-NEXT: s_waitcnt lgkmcnt(0)
736 ; VI-NEXT: v_mov_b32_e32 v1, s7
737 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
738 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
739 ; VI-NEXT: v_mov_b32_e32 v3, s1
740 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
741 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
742 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
743 ; VI-NEXT: s_waitcnt vmcnt(0)
744 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
745 ; VI-NEXT: s_waitcnt vmcnt(0)
746 ; VI-NEXT: v_mov_b32_e32 v1, s5
747 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
748 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
749 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
750 ; VI-NEXT: flat_store_dword v[0:1], v2
753 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
755 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
756 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
757 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
758 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
760 ; GFX9-NEXT: s_waitcnt vmcnt(0)
761 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
762 ; GFX9-NEXT: s_waitcnt vmcnt(0)
763 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
764 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
765 ; GFX9-NEXT: s_endpgm
767 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
769 ; GFX10-NEXT: s_clause 0x1
770 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
771 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
772 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
773 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
774 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
775 ; GFX10-NEXT: s_waitcnt vmcnt(0)
776 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
777 ; GFX10-NEXT: s_waitcnt vmcnt(0)
778 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2
779 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
780 ; GFX10-NEXT: s_endpgm
782 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
784 ; GFX11-NEXT: s_clause 0x1
785 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
786 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
787 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
788 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
789 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
790 ; GFX11-NEXT: s_waitcnt vmcnt(0)
791 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
792 ; GFX11-NEXT: s_waitcnt vmcnt(0)
793 ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
794 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
795 ; GFX11-NEXT: s_nop 0
796 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797 ; GFX11-NEXT: s_endpgm
; Index all three pointers by the sign-extended workitem id.
798 %tid = call i32 @llvm.amdgcn.workitem.id.x()
799 %tid.ext = sext i32 %tid to i64
800 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
801 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
802 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
803 %a = load volatile float, ptr addrspace(1) %a.gep
804 %b = load volatile float, ptr addrspace(1) %b.gep
; First operand: absolute value of %a, then negated via subtraction from -0.0.
805 %fabs.a = call float @llvm.fabs.f32(float %a)
806 %neg.fabs.a = fsub float -0.0, %fabs.a
; Second operand: plain negation of %b.
807 %neg.b = fsub float -0.0, %b
808 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
809 store <2 x half> %cvt, ptr addrspace(1) %out.gep
; Intrinsics called by the kernels in this file: the packed convert under test,
; the float absolute value, and the workitem id used for per-lane addressing.
813 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
814 declare float @llvm.fabs.f32(float) #1
815 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is referenced by the kernel definitions, #1 by the
; intrinsic declarations.
818 attributes #0 = { nounwind }
819 attributes #1 = { nounwind readnone }