1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
; Scalar-operand case. %x and %y are kernel arguments handed straight to
; @llvm.amdgcn.cvt.pkrtz, and the <2 x half> result is stored to %out.
; In every run the expected code first copies one of the two loaded SGPRs into
; a VGPR (v_mov_b32), so the convert reads one SGPR and one VGPR.
6 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
7 ; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
9 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
10 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, -1
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: v_mov_b32_e32 v0, s3
15 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
16 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
19 ; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
21 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
22 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
23 ; VI-NEXT: s_waitcnt lgkmcnt(0)
24 ; VI-NEXT: v_mov_b32_e32 v0, s2
25 ; VI-NEXT: v_mov_b32_e32 v2, s1
26 ; VI-NEXT: v_mov_b32_e32 v1, s3
27 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
28 ; VI-NEXT: flat_store_dword v[0:1], v2
31 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
33 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
34 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
35 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
37 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
38 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
39 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
40 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
; Pack the two scalar arguments and write the packed pair to %out.
42 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
43 store <2 x half> %result, <2 x half> addrspace(1)* %out
; Same-register case. The single kernel argument %x is passed as both operands
; of @llvm.amdgcn.cvt.pkrtz. The expected convert names the same SGPR twice
; (s2, s2 on the SI run; s0, s0 on the other runs) and no v_mov_b32 copy of the
; operand is expected before it.
47 define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
48 ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
50 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
51 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
52 ; SI-NEXT: s_mov_b32 s7, 0xf000
53 ; SI-NEXT: s_mov_b32 s6, -1
54 ; SI-NEXT: s_waitcnt lgkmcnt(0)
55 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2
56 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
59 ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
61 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
62 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
63 ; VI-NEXT: s_waitcnt lgkmcnt(0)
64 ; VI-NEXT: v_mov_b32_e32 v0, s2
65 ; VI-NEXT: v_mov_b32_e32 v1, s3
66 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
67 ; VI-NEXT: flat_store_dword v[0:1], v2
70 ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
72 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
73 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
74 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
76 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
77 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
78 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
; %x feeds both operands of the intrinsic.
80 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
81 store <2 x half> %result, <2 x half> addrspace(1)* %out
; Undef case. Both operands of @llvm.amdgcn.cvt.pkrtz are undef and the result
; is stored to %out. Of the assertions visible here, only the function label
; is checked, using the prefix shared by all three runs.
85 define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
86 ; GCN-LABEL: s_cvt_pkrtz_undef_undef:
89 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
90 store <2 x half> %result, <2 x half> addrspace(1)* %out
; Vector-operand case. %a and %b are read with volatile loads from %a.ptr and
; %b.ptr at an index taken from the workitem id, and the packed result is
; stored to %out at the same index. The expected convert takes both operands
; from VGPRs (the destinations of the two loads).
94 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
95 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
97 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
98 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
99 ; SI-NEXT: s_mov_b32 s3, 0xf000
100 ; SI-NEXT: s_mov_b32 s2, 0
101 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
102 ; SI-NEXT: v_mov_b32_e32 v1, 0
103 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
104 ; SI-NEXT: s_waitcnt lgkmcnt(0)
105 ; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
106 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
107 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
108 ; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
109 ; SI-NEXT: s_waitcnt vmcnt(0)
110 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
111 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
114 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
116 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
117 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
118 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
119 ; VI-NEXT: s_waitcnt lgkmcnt(0)
120 ; VI-NEXT: v_mov_b32_e32 v1, s7
121 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
122 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
123 ; VI-NEXT: v_mov_b32_e32 v3, s1
124 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
125 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
126 ; VI-NEXT: flat_load_dword v5, v[0:1]
127 ; VI-NEXT: flat_load_dword v2, v[2:3]
128 ; VI-NEXT: v_mov_b32_e32 v1, s5
129 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
130 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
131 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
132 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
133 ; VI-NEXT: flat_store_dword v[0:1], v2
136 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
138 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
139 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
140 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
141 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
142 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
143 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
144 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
145 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
146 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
147 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
148 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
149 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
150 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
151 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
152 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
153 ; GFX9-NEXT: s_waitcnt vmcnt(0)
154 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
155 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
156 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes both inputs and the output.
157 %tid = call i32 @llvm.amdgcn.workitem.id.x()
158 %tid.ext = sext i32 %tid to i64
159 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
160 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
161 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
162 %a = load volatile float, float addrspace(1)* %a.gep
163 %b = load volatile float, float addrspace(1)* %b.gep
164 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
165 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Register/immediate case. The first operand is a value loaded per workitem
; from %a.ptr and the second operand is the constant 1.0. The expected convert
; carries 1.0 directly as its last operand; the SI run expects the _e64 form of
; the instruction.
169 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
170 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
172 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
173 ; SI-NEXT: s_mov_b32 s7, 0xf000
174 ; SI-NEXT: s_mov_b32 s6, 0
175 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
176 ; SI-NEXT: v_mov_b32_e32 v1, 0
177 ; SI-NEXT: s_waitcnt lgkmcnt(0)
178 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
179 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
180 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
181 ; SI-NEXT: s_waitcnt vmcnt(0)
182 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
183 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
186 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
188 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
189 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
190 ; VI-NEXT: s_waitcnt lgkmcnt(0)
191 ; VI-NEXT: v_mov_b32_e32 v1, s3
192 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
193 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194 ; VI-NEXT: flat_load_dword v3, v[0:1]
195 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
196 ; VI-NEXT: v_mov_b32_e32 v1, s1
197 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
198 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
199 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
200 ; VI-NEXT: flat_store_dword v[0:1], v2
203 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
205 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
206 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
209 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
210 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
211 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
212 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
213 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
214 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
215 ; GFX9-NEXT: s_waitcnt vmcnt(0)
216 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
217 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
218 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes the input and the output.
219 %tid = call i32 @llvm.amdgcn.workitem.id.x()
220 %tid.ext = sext i32 %tid to i64
221 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
222 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
223 %a = load volatile float, float addrspace(1)* %a.gep
; Constant 1.0 in the second operand position.
224 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
225 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Immediate/register case, the mirror of the reg_imm test. The first operand is
; the constant 1.0 and the second is a value loaded per workitem from %a.ptr.
; The expected convert carries 1.0 directly as its first source operand; the
; SI run expects the _e32 form of the instruction.
229 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
230 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
232 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
233 ; SI-NEXT: s_mov_b32 s7, 0xf000
234 ; SI-NEXT: s_mov_b32 s6, 0
235 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
236 ; SI-NEXT: v_mov_b32_e32 v1, 0
237 ; SI-NEXT: s_waitcnt lgkmcnt(0)
238 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
239 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
240 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
241 ; SI-NEXT: s_waitcnt vmcnt(0)
242 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
243 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
246 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
248 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
249 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
250 ; VI-NEXT: s_waitcnt lgkmcnt(0)
251 ; VI-NEXT: v_mov_b32_e32 v1, s3
252 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
254 ; VI-NEXT: flat_load_dword v3, v[0:1]
255 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
256 ; VI-NEXT: v_mov_b32_e32 v1, s1
257 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
258 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
259 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
260 ; VI-NEXT: flat_store_dword v[0:1], v2
263 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
265 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
266 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
267 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
269 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
270 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
271 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
272 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
273 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
274 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
275 ; GFX9-NEXT: s_waitcnt vmcnt(0)
276 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
277 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
278 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes the input and the output.
279 %tid = call i32 @llvm.amdgcn.workitem.id.x()
280 %tid.ext = sext i32 %tid to i64
281 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
282 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
283 %a = load volatile float, float addrspace(1)* %a.gep
; Constant 1.0 in the first operand position.
284 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
285 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Negated first operand. %a is negated in IR (fsub from -0.0) before being
; passed as the first operand of @llvm.amdgcn.cvt.pkrtz; %b is passed as
; loaded. The expected convert shows the negation on the operand itself
; (-v2 on the SI run, -v5 on the other runs) and the assertions list no
; separate negate instruction ahead of it.
289 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
290 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
292 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
293 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
294 ; SI-NEXT: s_mov_b32 s3, 0xf000
295 ; SI-NEXT: s_mov_b32 s2, 0
296 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
297 ; SI-NEXT: v_mov_b32_e32 v1, 0
298 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
299 ; SI-NEXT: s_waitcnt lgkmcnt(0)
300 ; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
301 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
302 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
303 ; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
304 ; SI-NEXT: s_waitcnt vmcnt(0)
305 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
306 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
309 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
311 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
312 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
313 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
314 ; VI-NEXT: s_waitcnt lgkmcnt(0)
315 ; VI-NEXT: v_mov_b32_e32 v1, s7
316 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
317 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
318 ; VI-NEXT: v_mov_b32_e32 v3, s1
319 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
320 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
321 ; VI-NEXT: flat_load_dword v5, v[0:1]
322 ; VI-NEXT: flat_load_dword v2, v[2:3]
323 ; VI-NEXT: v_mov_b32_e32 v1, s5
324 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
325 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
327 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
328 ; VI-NEXT: flat_store_dword v[0:1], v2
331 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
333 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
334 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
335 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
336 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
338 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
339 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
340 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
341 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
342 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
343 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
344 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
345 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
346 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
347 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
348 ; GFX9-NEXT: s_waitcnt vmcnt(0)
349 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
350 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
351 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes both inputs and the output.
352 %tid = call i32 @llvm.amdgcn.workitem.id.x()
353 %tid.ext = sext i32 %tid to i64
354 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
355 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
356 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
357 %a = load volatile float, float addrspace(1)* %a.gep
358 %b = load volatile float, float addrspace(1)* %b.gep
; Negation of %a written as a subtraction from -0.0.
359 %neg.a = fsub float -0.0, %a
360 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
361 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Negated second operand. %b is negated in IR (fsub from -0.0) before being
; passed as the second operand of @llvm.amdgcn.cvt.pkrtz; %a is passed as
; loaded. The expected convert shows the negation on the second source operand
; (-v3 on the SI run, -v2 on the other runs) and the assertions list no
; separate negate instruction ahead of it.
365 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
366 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
368 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
369 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
370 ; SI-NEXT: s_mov_b32 s3, 0xf000
371 ; SI-NEXT: s_mov_b32 s2, 0
372 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; SI-NEXT: v_mov_b32_e32 v1, 0
374 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
375 ; SI-NEXT: s_waitcnt lgkmcnt(0)
376 ; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
377 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
378 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
379 ; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
380 ; SI-NEXT: s_waitcnt vmcnt(0)
381 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
382 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
385 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
387 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
388 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
389 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
390 ; VI-NEXT: s_waitcnt lgkmcnt(0)
391 ; VI-NEXT: v_mov_b32_e32 v1, s7
392 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
393 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
394 ; VI-NEXT: v_mov_b32_e32 v3, s1
395 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
396 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
397 ; VI-NEXT: flat_load_dword v5, v[0:1]
398 ; VI-NEXT: flat_load_dword v2, v[2:3]
399 ; VI-NEXT: v_mov_b32_e32 v1, s5
400 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
401 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
402 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
403 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
404 ; VI-NEXT: flat_store_dword v[0:1], v2
407 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
409 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
410 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
411 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
412 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
414 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
415 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
416 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
417 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
418 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
419 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
420 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
421 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
422 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
423 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
424 ; GFX9-NEXT: s_waitcnt vmcnt(0)
425 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
426 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
427 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes both inputs and the output.
428 %tid = call i32 @llvm.amdgcn.workitem.id.x()
429 %tid.ext = sext i32 %tid to i64
430 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
431 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
432 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
433 %a = load volatile float, float addrspace(1)* %a.gep
434 %b = load volatile float, float addrspace(1)* %b.gep
; Negation of %b written as a subtraction from -0.0.
435 %neg.b = fsub float -0.0, %b
436 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
437 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Both operands negated. %a and %b are each negated in IR (fsub from -0.0)
; before being passed to @llvm.amdgcn.cvt.pkrtz. The expected convert shows
; the negation on both source operands (-v2, -v3 on the SI run; -v5, -v2 on
; the other runs) and the assertions list no separate negate instructions.
441 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
442 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
444 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
445 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
446 ; SI-NEXT: s_mov_b32 s3, 0xf000
447 ; SI-NEXT: s_mov_b32 s2, 0
448 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
449 ; SI-NEXT: v_mov_b32_e32 v1, 0
450 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
451 ; SI-NEXT: s_waitcnt lgkmcnt(0)
452 ; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
453 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
454 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
455 ; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
456 ; SI-NEXT: s_waitcnt vmcnt(0)
457 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
458 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
461 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
463 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
464 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
465 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
467 ; VI-NEXT: v_mov_b32_e32 v1, s7
468 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
469 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
470 ; VI-NEXT: v_mov_b32_e32 v3, s1
471 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
472 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
473 ; VI-NEXT: flat_load_dword v5, v[0:1]
474 ; VI-NEXT: flat_load_dword v2, v[2:3]
475 ; VI-NEXT: v_mov_b32_e32 v1, s5
476 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
477 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
478 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
479 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
480 ; VI-NEXT: flat_store_dword v[0:1], v2
483 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
485 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
486 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
487 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
488 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
490 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
491 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
492 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
493 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
494 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
495 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
496 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
497 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
498 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
499 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
501 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
502 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
503 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes both inputs and the output.
504 %tid = call i32 @llvm.amdgcn.workitem.id.x()
505 %tid.ext = sext i32 %tid to i64
506 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
507 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
508 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
509 %a = load volatile float, float addrspace(1)* %a.gep
510 %b = load volatile float, float addrspace(1)* %b.gep
; Negations of %a and %b, each written as a subtraction from -0.0.
511 %neg.a = fsub float -0.0, %a
512 %neg.b = fsub float -0.0, %b
513 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
514 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Negated absolute value on the first operand, plain negation on the second.
; The first operand is fsub(-0.0, fabs(%a)) and the second is fsub(-0.0, %b).
; The expected convert shows both on the operands themselves: -|v2|, -v3 on
; the SI run and -|v5|, -v2 on the other runs. The assertions list no separate
; fabs or negate instructions ahead of the convert.
518 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
519 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
521 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
522 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
523 ; SI-NEXT: s_mov_b32 s3, 0xf000
524 ; SI-NEXT: s_mov_b32 s2, 0
525 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
526 ; SI-NEXT: v_mov_b32_e32 v1, 0
527 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
528 ; SI-NEXT: s_waitcnt lgkmcnt(0)
529 ; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
530 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
531 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
532 ; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
533 ; SI-NEXT: s_waitcnt vmcnt(0)
534 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
535 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
538 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
540 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
541 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
542 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
543 ; VI-NEXT: s_waitcnt lgkmcnt(0)
544 ; VI-NEXT: v_mov_b32_e32 v1, s7
545 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
546 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
547 ; VI-NEXT: v_mov_b32_e32 v3, s1
548 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
550 ; VI-NEXT: flat_load_dword v5, v[0:1]
551 ; VI-NEXT: flat_load_dword v2, v[2:3]
552 ; VI-NEXT: v_mov_b32_e32 v1, s5
553 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
554 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
555 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
556 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
557 ; VI-NEXT: flat_store_dword v[0:1], v2
560 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
562 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
563 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
565 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
566 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
567 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
568 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
569 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
570 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
571 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
572 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
573 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
574 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
575 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
576 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
577 ; GFX9-NEXT: s_waitcnt vmcnt(0)
578 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
579 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
580 ; GFX9-NEXT: s_endpgm
; The sign-extended workitem id indexes both inputs and the output.
581 %tid = call i32 @llvm.amdgcn.workitem.id.x()
582 %tid.ext = sext i32 %tid to i64
583 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
584 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
585 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
586 %a = load volatile float, float addrspace(1)* %a.gep
587 %b = load volatile float, float addrspace(1)* %b.gep
; First operand is -(fabs %a); second operand is -%b.
588 %fabs.a = call float @llvm.fabs.f32(float %a)
589 %neg.fabs.a = fsub float -0.0, %fabs.a
590 %neg.b = fsub float -0.0, %b
591 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
592 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Declarations of the intrinsics called by the test functions in this file.
; The convert under test takes two floats and returns a <2 x half>.
596 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
597 declare float @llvm.fabs.f32(float) #1
598 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups. #0 is attached to the kernel definitions and #1 to the
; intrinsic declarations above.
601 attributes #0 = { nounwind }
602 attributes #1 = { nounwind readnone }