1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
; Both float operands of llvm.amdgcn.cvt.pkrtz are kernel arguments (%x, %y),
; so the checks expect them to arrive through s_load instructions into
; s registers. The <2 x half> result is stored to %out.
; For the SI, VI and GFX9 prefixes the expected code first copies the second
; operand into a v register (v_mov_b32) before the conversion; for the GFX10
; prefix the expected _e64 conversion reads both s registers directly.
7 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
8 ; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
10 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
11 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
12 ; SI-NEXT: s_mov_b32 s3, 0xf000
13 ; SI-NEXT: s_mov_b32 s2, -1
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: v_mov_b32_e32 v0, s5
16 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0
17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
20 ; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
22 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
23 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
24 ; VI-NEXT: s_waitcnt lgkmcnt(0)
25 ; VI-NEXT: v_mov_b32_e32 v0, s1
26 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
27 ; VI-NEXT: v_mov_b32_e32 v0, s2
28 ; VI-NEXT: v_mov_b32_e32 v1, s3
29 ; VI-NEXT: flat_store_dword v[0:1], v2
32 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
35 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
36 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
37 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
38 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
39 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1
40 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
43 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
45 ; GFX10-NEXT: s_clause 0x1
46 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
47 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
48 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
49 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
51 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
52 ; GFX10-NEXT: s_endpgm
53 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
54 store <2 x half> %result, <2 x half> addrspace(1)* %out
; The same kernel argument %x is passed as both operands of
; llvm.amdgcn.cvt.pkrtz. For every check prefix the expected conversion names
; one s register twice as its sources (for example "s4, s4"), with no
; intermediate copy into a v register.
58 define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
59 ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
61 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
62 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
63 ; SI-NEXT: s_mov_b32 s3, 0xf000
64 ; SI-NEXT: s_mov_b32 s2, -1
65 ; SI-NEXT: s_waitcnt lgkmcnt(0)
66 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4
67 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
70 ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
72 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
73 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
74 ; VI-NEXT: s_waitcnt lgkmcnt(0)
75 ; VI-NEXT: v_mov_b32_e32 v0, s2
76 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
77 ; VI-NEXT: v_mov_b32_e32 v1, s3
78 ; VI-NEXT: flat_store_dword v[0:1], v2
81 ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
83 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
84 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
85 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
86 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
87 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4
88 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
91 ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
93 ; GFX10-NEXT: s_clause 0x1
94 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
95 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
96 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
97 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
98 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4
99 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
100 ; GFX10-NEXT: s_endpgm
101 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
102 store <2 x half> %result, <2 x half> addrspace(1)* %out
; Both operands of llvm.amdgcn.cvt.pkrtz are undef. The only instruction check
; visible here is the GFX10 one, and it is just s_endpgm: no conversion and no
; store are expected in the output.
; NOTE(review): the checks following the shared GCN label are not visible in
; this view -- confirm they also expect only s_endpgm.
106 define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
107 ; GCN-LABEL: s_cvt_pkrtz_undef_undef:
111 ; GFX10-LABEL: s_cvt_pkrtz_undef_undef:
113 ; GFX10-NEXT: s_endpgm
114 %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
115 store <2 x half> %result, <2 x half> addrspace(1)* %out
; Both operands come from volatile loads of %a.ptr and %b.ptr, each indexed by
; the workitem id, and the result is stored to %out at the same index.
; The expected conversion therefore takes two v registers as sources. The
; volatile loads show up in the checks as separate loads carrying the glc
; modifier (glc dlc for GFX10), each followed by its own s_waitcnt vmcnt(0).
119 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
120 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
122 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
123 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
124 ; SI-NEXT: s_mov_b32 s11, 0xf000
125 ; SI-NEXT: s_mov_b32 s10, 0
126 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
127 ; SI-NEXT: v_mov_b32_e32 v1, 0
128 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
129 ; SI-NEXT: s_waitcnt lgkmcnt(0)
130 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
131 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
132 ; SI-NEXT: s_waitcnt vmcnt(0)
133 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
134 ; SI-NEXT: s_waitcnt vmcnt(0)
135 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
136 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
137 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
140 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
142 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
143 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
144 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
145 ; VI-NEXT: s_waitcnt lgkmcnt(0)
146 ; VI-NEXT: v_mov_b32_e32 v1, s7
147 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
148 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
149 ; VI-NEXT: v_mov_b32_e32 v3, s1
150 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
151 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
152 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
153 ; VI-NEXT: s_waitcnt vmcnt(0)
154 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
155 ; VI-NEXT: s_waitcnt vmcnt(0)
156 ; VI-NEXT: v_mov_b32_e32 v5, s5
157 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
158 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
159 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
160 ; VI-NEXT: flat_store_dword v[4:5], v0
163 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
165 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
166 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
167 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
169 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
171 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
172 ; GFX9-NEXT: s_waitcnt vmcnt(0)
173 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
174 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
175 ; GFX9-NEXT: s_endpgm
177 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32:
179 ; GFX10-NEXT: s_clause 0x1
180 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
181 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
182 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
183 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
185 ; GFX10-NEXT: s_waitcnt vmcnt(0)
186 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
187 ; GFX10-NEXT: s_waitcnt vmcnt(0)
188 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2
189 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
190 ; GFX10-NEXT: s_endpgm
191 %tid = call i32 @llvm.amdgcn.workitem.id.x()
192 %tid.ext = sext i32 %tid to i64
193 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
194 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
195 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
196 %a = load volatile float, float addrspace(1)* %a.gep
197 %b = load volatile float, float addrspace(1)* %b.gep
198 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
199 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; The first operand is a value loaded (volatile) from %a.ptr at the workitem
; index; the second operand is the constant 1.0.
; The checks expect 1.0 to appear directly as the second source of the
; conversion, with no separate instruction materializing it. The SI and GFX10
; check lines spell the conversion with the _e64 suffix.
203 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
204 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
206 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
207 ; SI-NEXT: s_mov_b32 s7, 0xf000
208 ; SI-NEXT: s_mov_b32 s6, 0
209 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
210 ; SI-NEXT: v_mov_b32_e32 v1, 0
211 ; SI-NEXT: s_waitcnt lgkmcnt(0)
212 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
213 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
214 ; SI-NEXT: s_waitcnt vmcnt(0)
215 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
216 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
217 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
220 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
222 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
223 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
224 ; VI-NEXT: s_waitcnt lgkmcnt(0)
225 ; VI-NEXT: v_mov_b32_e32 v1, s3
226 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
227 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
228 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
229 ; VI-NEXT: s_waitcnt vmcnt(0)
230 ; VI-NEXT: v_mov_b32_e32 v3, s1
231 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
232 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
233 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
234 ; VI-NEXT: flat_store_dword v[2:3], v0
237 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
239 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
240 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
241 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
243 ; GFX9-NEXT: s_waitcnt vmcnt(0)
244 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
245 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
246 ; GFX9-NEXT: s_endpgm
248 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
250 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
251 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
252 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
254 ; GFX10-NEXT: s_waitcnt vmcnt(0)
255 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
256 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
257 ; GFX10-NEXT: s_endpgm
258 %tid = call i32 @llvm.amdgcn.workitem.id.x()
259 %tid.ext = sext i32 %tid to i64
260 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
261 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
262 %a = load volatile float, float addrspace(1)* %a.gep
263 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
264 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Mirror of the reg_imm case: the constant 1.0 is the first operand and the
; value loaded (volatile) from %a.ptr is the second.
; The checks expect 1.0 to appear directly as the first source of the
; conversion. Here the SI and GFX10 check lines spell the conversion with the
; _e32 suffix.
268 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
269 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
271 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
272 ; SI-NEXT: s_mov_b32 s7, 0xf000
273 ; SI-NEXT: s_mov_b32 s6, 0
274 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
275 ; SI-NEXT: v_mov_b32_e32 v1, 0
276 ; SI-NEXT: s_waitcnt lgkmcnt(0)
277 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
278 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
279 ; SI-NEXT: s_waitcnt vmcnt(0)
280 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
281 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
282 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
285 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
287 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
288 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
289 ; VI-NEXT: s_waitcnt lgkmcnt(0)
290 ; VI-NEXT: v_mov_b32_e32 v1, s3
291 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
292 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
293 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
294 ; VI-NEXT: s_waitcnt vmcnt(0)
295 ; VI-NEXT: v_mov_b32_e32 v3, s1
296 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
297 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
298 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
299 ; VI-NEXT: flat_store_dword v[2:3], v0
302 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
304 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
305 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
306 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
307 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
310 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
311 ; GFX9-NEXT: s_endpgm
313 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
315 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
316 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
317 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
318 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
319 ; GFX10-NEXT: s_waitcnt vmcnt(0)
320 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
321 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
322 ; GFX10-NEXT: s_endpgm
323 %tid = call i32 @llvm.amdgcn.workitem.id.x()
324 %tid.ext = sext i32 %tid to i64
325 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
326 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
327 %a = load volatile float, float addrspace(1)* %a.gep
328 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
329 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; The first operand is negated in IR (fsub float -0.0, %a) before the call;
; the second operand %b is passed unchanged. Both values come from volatile
; loads indexed by the workitem id.
; The checks expect the negation to be folded into the conversion as a source
; modifier on the first operand (for example "-v2, v3"), with no separate
; negate instruction.
333 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
334 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
336 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
337 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
338 ; SI-NEXT: s_mov_b32 s11, 0xf000
339 ; SI-NEXT: s_mov_b32 s10, 0
340 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
341 ; SI-NEXT: v_mov_b32_e32 v1, 0
342 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
343 ; SI-NEXT: s_waitcnt lgkmcnt(0)
344 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
345 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
346 ; SI-NEXT: s_waitcnt vmcnt(0)
347 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
348 ; SI-NEXT: s_waitcnt vmcnt(0)
349 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
350 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
351 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
354 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
356 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
357 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
358 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
359 ; VI-NEXT: s_waitcnt lgkmcnt(0)
360 ; VI-NEXT: v_mov_b32_e32 v1, s7
361 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
362 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
363 ; VI-NEXT: v_mov_b32_e32 v3, s1
364 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
365 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
366 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
367 ; VI-NEXT: s_waitcnt vmcnt(0)
368 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
369 ; VI-NEXT: s_waitcnt vmcnt(0)
370 ; VI-NEXT: v_mov_b32_e32 v5, s5
371 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
372 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
373 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
374 ; VI-NEXT: flat_store_dword v[4:5], v0
377 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
379 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
380 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
381 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
382 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
383 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
384 ; GFX9-NEXT: s_waitcnt vmcnt(0)
385 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
386 ; GFX9-NEXT: s_waitcnt vmcnt(0)
387 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
388 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
389 ; GFX9-NEXT: s_endpgm
391 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
393 ; GFX10-NEXT: s_clause 0x1
394 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
395 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
396 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
397 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
398 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
399 ; GFX10-NEXT: s_waitcnt vmcnt(0)
400 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
401 ; GFX10-NEXT: s_waitcnt vmcnt(0)
402 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2
403 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
404 ; GFX10-NEXT: s_endpgm
405 %tid = call i32 @llvm.amdgcn.workitem.id.x()
406 %tid.ext = sext i32 %tid to i64
407 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
408 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
409 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
410 %a = load volatile float, float addrspace(1)* %a.gep
411 %b = load volatile float, float addrspace(1)* %b.gep
412 %neg.a = fsub float -0.0, %a
413 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
414 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; The second operand is negated in IR (fsub float -0.0, %b) before the call;
; the first operand %a is passed unchanged. Both values come from volatile
; loads indexed by the workitem id.
; The checks expect the negation to be folded into the conversion as a source
; modifier on the second operand (for example "v2, -v3").
418 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
419 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
421 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
422 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
423 ; SI-NEXT: s_mov_b32 s11, 0xf000
424 ; SI-NEXT: s_mov_b32 s10, 0
425 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
426 ; SI-NEXT: v_mov_b32_e32 v1, 0
427 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
428 ; SI-NEXT: s_waitcnt lgkmcnt(0)
429 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
430 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
431 ; SI-NEXT: s_waitcnt vmcnt(0)
432 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
433 ; SI-NEXT: s_waitcnt vmcnt(0)
434 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
435 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
436 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
439 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
441 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
442 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
443 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
444 ; VI-NEXT: s_waitcnt lgkmcnt(0)
445 ; VI-NEXT: v_mov_b32_e32 v1, s7
446 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
447 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
448 ; VI-NEXT: v_mov_b32_e32 v3, s1
449 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
450 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
451 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
452 ; VI-NEXT: s_waitcnt vmcnt(0)
453 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
454 ; VI-NEXT: s_waitcnt vmcnt(0)
455 ; VI-NEXT: v_mov_b32_e32 v5, s5
456 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
457 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
458 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
459 ; VI-NEXT: flat_store_dword v[4:5], v0
462 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
464 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
465 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
466 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
467 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
471 ; GFX9-NEXT: s_waitcnt vmcnt(0)
472 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
473 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
474 ; GFX9-NEXT: s_endpgm
476 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
478 ; GFX10-NEXT: s_clause 0x1
479 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
480 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
481 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
482 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
484 ; GFX10-NEXT: s_waitcnt vmcnt(0)
485 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
486 ; GFX10-NEXT: s_waitcnt vmcnt(0)
487 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2
488 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
489 ; GFX10-NEXT: s_endpgm
490 %tid = call i32 @llvm.amdgcn.workitem.id.x()
491 %tid.ext = sext i32 %tid to i64
492 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
493 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
494 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
495 %a = load volatile float, float addrspace(1)* %a.gep
496 %b = load volatile float, float addrspace(1)* %b.gep
497 %neg.b = fsub float -0.0, %b
498 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
499 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Both operands are negated in IR (fsub float -0.0, ...) before the call.
; Both values come from volatile loads indexed by the workitem id.
; The checks expect both negations to be folded into the conversion as source
; modifiers (for example "-v2, -v3"), with no separate negate instructions.
503 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
504 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
506 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
507 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
508 ; SI-NEXT: s_mov_b32 s11, 0xf000
509 ; SI-NEXT: s_mov_b32 s10, 0
510 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
511 ; SI-NEXT: v_mov_b32_e32 v1, 0
512 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
513 ; SI-NEXT: s_waitcnt lgkmcnt(0)
514 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
515 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
516 ; SI-NEXT: s_waitcnt vmcnt(0)
517 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
518 ; SI-NEXT: s_waitcnt vmcnt(0)
519 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
520 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
521 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
524 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
526 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
527 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
528 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
529 ; VI-NEXT: s_waitcnt lgkmcnt(0)
530 ; VI-NEXT: v_mov_b32_e32 v1, s7
531 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
532 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
533 ; VI-NEXT: v_mov_b32_e32 v3, s1
534 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
535 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
536 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
537 ; VI-NEXT: s_waitcnt vmcnt(0)
538 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
539 ; VI-NEXT: s_waitcnt vmcnt(0)
540 ; VI-NEXT: v_mov_b32_e32 v5, s5
541 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
542 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
543 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
544 ; VI-NEXT: flat_store_dword v[4:5], v0
547 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
549 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
550 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
551 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
552 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
553 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
554 ; GFX9-NEXT: s_waitcnt vmcnt(0)
555 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
556 ; GFX9-NEXT: s_waitcnt vmcnt(0)
557 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
558 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
559 ; GFX9-NEXT: s_endpgm
561 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
563 ; GFX10-NEXT: s_clause 0x1
564 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
565 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
566 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
567 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
569 ; GFX10-NEXT: s_waitcnt vmcnt(0)
570 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
571 ; GFX10-NEXT: s_waitcnt vmcnt(0)
572 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2
573 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
574 ; GFX10-NEXT: s_endpgm
575 %tid = call i32 @llvm.amdgcn.workitem.id.x()
576 %tid.ext = sext i32 %tid to i64
577 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
578 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
579 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
580 %a = load volatile float, float addrspace(1)* %a.gep
581 %b = load volatile float, float addrspace(1)* %b.gep
582 %neg.a = fsub float -0.0, %a
583 %neg.b = fsub float -0.0, %b
584 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
585 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; The first operand is -fabs(%a) (llvm.fabs.f32 followed by fsub from -0.0)
; and the second operand is -%b. Both values come from volatile loads indexed
; by the workitem id.
; The checks expect the combined neg+abs on the first operand and the neg on
; the second to be folded into the conversion as source modifiers (for example
; "-|v2|, -v3"), with no separate abs or negate instructions.
589 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
590 ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
592 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
593 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
594 ; SI-NEXT: s_mov_b32 s11, 0xf000
595 ; SI-NEXT: s_mov_b32 s10, 0
596 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
597 ; SI-NEXT: v_mov_b32_e32 v1, 0
598 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
599 ; SI-NEXT: s_waitcnt lgkmcnt(0)
600 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
601 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
602 ; SI-NEXT: s_waitcnt vmcnt(0)
603 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
604 ; SI-NEXT: s_waitcnt vmcnt(0)
605 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
606 ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
607 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
610 ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
612 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
613 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
614 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
615 ; VI-NEXT: s_waitcnt lgkmcnt(0)
616 ; VI-NEXT: v_mov_b32_e32 v1, s7
617 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
618 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
619 ; VI-NEXT: v_mov_b32_e32 v3, s1
620 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
621 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
622 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
623 ; VI-NEXT: s_waitcnt vmcnt(0)
624 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
625 ; VI-NEXT: s_waitcnt vmcnt(0)
626 ; VI-NEXT: v_mov_b32_e32 v5, s5
627 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
628 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
629 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
630 ; VI-NEXT: flat_store_dword v[4:5], v0
633 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
635 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
636 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
637 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
638 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
640 ; GFX9-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
642 ; GFX9-NEXT: s_waitcnt vmcnt(0)
643 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
644 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
645 ; GFX9-NEXT: s_endpgm
647 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
649 ; GFX10-NEXT: s_clause 0x1
650 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
651 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
652 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
653 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
655 ; GFX10-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
657 ; GFX10-NEXT: s_waitcnt vmcnt(0)
658 ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2
659 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
660 ; GFX10-NEXT: s_endpgm
661 %tid = call i32 @llvm.amdgcn.workitem.id.x()
662 %tid.ext = sext i32 %tid to i64
663 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
664 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
665 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
666 %a = load volatile float, float addrspace(1)* %a.gep
667 %b = load volatile float, float addrspace(1)* %b.gep
668 %fabs.a = call float @llvm.fabs.f32(float %a)
669 %neg.fabs.a = fsub float -0.0, %fabs.a
670 %neg.b = fsub float -0.0, %b
671 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
672 store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
; Intrinsics called by the kernels in this file: the conversion under test
; (two floats in, <2 x half> out), fabs for the source-modifier case, and the
; workitem id used to index the per-lane loads and stores.
676 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
677 declare float @llvm.fabs.f32(float) #1
678 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to every kernel definition in this file;
; group #1 is attached to the intrinsic declarations.
681 attributes #0 = { nounwind }
682 attributes #1 = { nounwind readnone }