1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
; External callees for the call tests in this file: each *_user function takes
; a half (or vector of half) argument and each *_result function returns one.
; All of them are declared with attribute group #0.
4 declare void @f16_user(half) #0
5 declare half @f16_result() #0
7 declare void @v2f16_user(<2 x half>) #0
8 declare <2 x half> @v2f16_result() #0
10 declare void @v4f16_user(<4 x half>) #0
11 declare <4 x half> @v4f16_result() #0
13 declare void @v8f16_user(<8 x half>) #0
14 declare <8 x half> @v8f16_result() #0
; Incoming scalar half argument: %arg is extended to float with the constrained
; fpext intrinsic (fpexcept.strict) and the result is stored through %ptr.
; The expected output converts v0 to f16, masks it to 16 bits and converts it
; back to f32 before the single dword store to v[1:2].
16 define void @f16_arg(half %arg, ptr %ptr) #0 {
17 ; GFX7-LABEL: f16_arg:
19 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
21 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
22 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
23 ; GFX7-NEXT: flat_store_dword v[1:2], v0
24 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25 ; GFX7-NEXT: s_setpc_b64 s[30:31]
26 %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
27 store float %fpext, ptr %ptr
; Incoming <2 x half> argument: extended to <2 x float> with the constrained
; fpext intrinsic (fpexcept.strict) and stored through %ptr.
; The expected output round-trips v0 and v1 through f16 individually and
; writes the two floats with separate dword stores at offsets 0 and 4.
31 define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
32 ; GFX7-LABEL: v2f16_arg:
34 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
36 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
37 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
38 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0
39 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1
40 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
41 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2
42 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
43 ; GFX7-NEXT: flat_store_dword v[0:1], v5
44 ; GFX7-NEXT: flat_store_dword v[2:3], v4
45 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
46 ; GFX7-NEXT: s_setpc_b64 s[30:31]
47 %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
48 store <2 x float> %fpext, ptr %ptr
; Incoming <3 x half> argument: extended to <3 x float> with the constrained
; fpext intrinsic (fpexcept.strict) and stored through %ptr.
; The expected output round-trips v0, v1 and v2 through f16 individually and
; writes the three floats with dword stores at offsets 8, 4 and 0.
52 define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
53 ; GFX7-LABEL: v3f16_arg:
55 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
57 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
58 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
59 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
60 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
61 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1
62 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
63 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
64 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
65 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3
66 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
67 ; GFX7-NEXT: flat_store_dword v[0:1], v2
68 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3
69 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
70 ; GFX7-NEXT: flat_store_dword v[0:1], v6
71 ; GFX7-NEXT: flat_store_dword v[3:4], v5
72 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
73 ; GFX7-NEXT: s_setpc_b64 s[30:31]
74 %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
75 store <3 x float> %fpext, ptr %ptr
; Incoming <4 x half> argument: extended to <4 x float> with the constrained
; fpext intrinsic (fpexcept.strict) and stored through %ptr.
; The expected output round-trips v0..v3 through f16 individually and writes
; the four floats with dword stores at offsets 12, 8, 4 and 0.
79 define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
80 ; GFX7-LABEL: v4f16_arg:
82 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
84 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
85 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
86 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
87 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v2
88 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v3
89 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
90 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
91 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
92 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
93 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
94 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
95 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4
96 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
97 ; GFX7-NEXT: flat_store_dword v[0:1], v2
98 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4
99 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
100 ; GFX7-NEXT: flat_store_dword v[0:1], v7
101 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
102 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
103 ; GFX7-NEXT: flat_store_dword v[0:1], v3
104 ; GFX7-NEXT: flat_store_dword v[4:5], v6
105 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
106 ; GFX7-NEXT: s_setpc_b64 s[30:31]
107 %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
108 store <4 x float> %fpext, ptr %ptr
; Scalar half return type: the float %arg is truncated to half with the
; constrained fptrunc intrinsic (round.tonearest, fpexcept.strict).
; The expected output converts v0 to f16, masks it to 16 bits and converts it
; back to f32 before returning.
112 define half @f16_return(float %arg) #0 {
113 ; GFX7-LABEL: f16_return:
115 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
117 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
118 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
119 ; GFX7-NEXT: s_setpc_b64 s[30:31]
120 %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
; <2 x half> return value: the <2 x float> %arg is truncated with the
; constrained fptrunc intrinsic (round.tonearest, fpexcept.strict) and returned.
; The expected output converts v0 and v1 to f16, masks each to 16 bits and
; converts each back to f32 before returning.
124 define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
125 ; GFX7-LABEL: v2f16_return:
127 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
129 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
130 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
131 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
132 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
133 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
134 ; GFX7-NEXT: s_setpc_b64 s[30:31]
135 %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
136 ret <2 x half> %fptrunc
; <3 x half> return value: the <3 x float> %arg is truncated with the
; constrained fptrunc intrinsic (round.tonearest, fpexcept.strict) and returned.
; The expected output converts v0..v2 to f16, masks each to 16 bits and
; converts each back to f32 before returning.
139 define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
140 ; GFX7-LABEL: v3f16_return:
142 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
144 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
145 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
146 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
147 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
148 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
149 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
150 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
151 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
152 ; GFX7-NEXT: s_setpc_b64 s[30:31]
153 %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
154 ret <3 x half> %fptrunc
; <4 x half> return value: the <4 x float> %arg is truncated with the
; constrained fptrunc intrinsic (round.tonearest, fpexcept.strict) and returned.
; The expected output converts v0..v3 to f16, masks each to 16 bits and
; converts each back to f32 before returning.
157 define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
158 ; GFX7-LABEL: v4f16_return:
160 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
162 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
163 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
164 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
165 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
166 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
167 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
168 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
169 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
170 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
171 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
172 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
173 ; GFX7-NEXT: s_setpc_b64 s[30:31]
174 %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
175 ret <4 x half> %fptrunc
; Outgoing scalar half argument: a half is loaded from %ptr and passed to
; @f16_user.
; The expected output loads 16 bits with flat_load_ushort and converts the
; value to f32 in v0 immediately before the call; v40 holds the saved s30, s31
; and previous s33 in lanes 0-2 around the call.
178 define void @outgoing_f16_arg(ptr %ptr) #0 {
179 ; GFX7-LABEL: outgoing_f16_arg:
181 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GFX7-NEXT: s_mov_b32 s16, s33
183 ; GFX7-NEXT: s_mov_b32 s33, s32
184 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
185 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
186 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
187 ; GFX7-NEXT: flat_load_ushort v0, v[0:1]
188 ; GFX7-NEXT: v_writelane_b32 v40, s16, 2
189 ; GFX7-NEXT: v_writelane_b32 v40, s30, 0
190 ; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi
191 ; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo
192 ; GFX7-NEXT: s_addk_i32 s32, 0x400
193 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1
194 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
195 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
196 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
197 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1
198 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0
199 ; GFX7-NEXT: s_mov_b32 s32, s33
200 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2
201 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
202 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
203 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
204 ; GFX7-NEXT: s_mov_b32 s33, s4
205 ; GFX7-NEXT: s_waitcnt vmcnt(0)
206 ; GFX7-NEXT: s_setpc_b64 s[30:31]
207 %val = load half, ptr %ptr
208 call void @f16_user(half %val)
; Outgoing <2 x half> argument: the vector is loaded from %ptr and passed to
; @v2f16_user.
; The expected output loads one dword, converts its low half to f32 in v0,
; then shifts right by 16 and converts the high half to f32 in v1 before the
; call.
212 define void @outgoing_v2f16_arg(ptr %ptr) #0 {
213 ; GFX7-LABEL: outgoing_v2f16_arg:
215 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216 ; GFX7-NEXT: s_mov_b32 s16, s33
217 ; GFX7-NEXT: s_mov_b32 s33, s32
218 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
219 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
220 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
221 ; GFX7-NEXT: flat_load_dword v1, v[0:1]
222 ; GFX7-NEXT: v_writelane_b32 v40, s16, 2
223 ; GFX7-NEXT: v_writelane_b32 v40, s30, 0
224 ; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi
225 ; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo
226 ; GFX7-NEXT: s_addk_i32 s32, 0x400
227 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1
228 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
229 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
230 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
231 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
232 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
233 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1
234 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0
235 ; GFX7-NEXT: s_mov_b32 s32, s33
236 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2
237 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
238 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
239 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
240 ; GFX7-NEXT: s_mov_b32 s33, s4
241 ; GFX7-NEXT: s_waitcnt vmcnt(0)
242 ; GFX7-NEXT: s_setpc_b64 s[30:31]
243 %val = load <2 x half>, ptr %ptr
244 call void @v2f16_user(<2 x half> %val)
; Scalar half returned from a call: the result of @f16_result is stored
; through %ptr.
; The expected output copies the pointer from v0/v1 into v40/v41 before the
; call. After the call it converts v0 to f16, masks it, converts back to f32,
; converts to f16 again and stores 16 bits through v[40:41].
248 define void @outgoing_f16_return(ptr %ptr) #0 {
249 ; GFX7-LABEL: outgoing_f16_return:
251 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GFX7-NEXT: s_mov_b32 s16, s33
253 ; GFX7-NEXT: s_mov_b32 s33, s32
254 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
255 ; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
256 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
257 ; GFX7-NEXT: v_writelane_b32 v42, s16, 2
258 ; GFX7-NEXT: v_writelane_b32 v42, s30, 0
259 ; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi
260 ; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo
261 ; GFX7-NEXT: s_addk_i32 s32, 0x400
262 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
263 ; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
264 ; GFX7-NEXT: v_writelane_b32 v42, s31, 1
265 ; GFX7-NEXT: v_mov_b32_e32 v41, v1
266 ; GFX7-NEXT: v_mov_b32_e32 v40, v0
267 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
268 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
269 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1
270 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0
271 ; GFX7-NEXT: s_mov_b32 s32, s33
272 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
273 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
274 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2
275 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
276 ; GFX7-NEXT: flat_store_short v[40:41], v0
277 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
278 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
279 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
280 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
281 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
282 ; GFX7-NEXT: s_mov_b32 s33, s4
283 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
284 ; GFX7-NEXT: s_setpc_b64 s[30:31]
285 %val = call half @f16_result()
286 store half %val, ptr %ptr
; <2 x half> returned from a call: the result of @v2f16_result is stored
; through %ptr.
; After the call the expected output round-trips v0 and v1 through f16,
; converts both to f16 again, packs them into one dword (v1 shifted left by 16,
; OR-ed with v0) and stores it through the pointer kept in v[40:41].
290 define void @outgoing_v2f16_return(ptr %ptr) #0 {
291 ; GFX7-LABEL: outgoing_v2f16_return:
293 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294 ; GFX7-NEXT: s_mov_b32 s16, s33
295 ; GFX7-NEXT: s_mov_b32 s33, s32
296 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
297 ; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
298 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
299 ; GFX7-NEXT: v_writelane_b32 v42, s16, 2
300 ; GFX7-NEXT: v_writelane_b32 v42, s30, 0
301 ; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi
302 ; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo
303 ; GFX7-NEXT: s_addk_i32 s32, 0x400
304 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
305 ; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
306 ; GFX7-NEXT: v_writelane_b32 v42, s31, 1
307 ; GFX7-NEXT: v_mov_b32_e32 v41, v1
308 ; GFX7-NEXT: v_mov_b32_e32 v40, v0
309 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
310 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
311 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
312 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1
313 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0
314 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
315 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
316 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
317 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
318 ; GFX7-NEXT: s_mov_b32 s32, s33
319 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
320 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2
321 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
322 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
323 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
324 ; GFX7-NEXT: flat_store_dword v[40:41], v0
325 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
326 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
327 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
328 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
329 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
330 ; GFX7-NEXT: s_mov_b32 s33, s4
331 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
332 ; GFX7-NEXT: s_setpc_b64 s[30:31]
333 %val = call <2 x half> @v2f16_result()
334 store <2 x half> %val, ptr %ptr
; <4 x half> returned from a call: the result of @v4f16_result is stored
; through %ptr.
; After the call the expected output round-trips v0..v3 through f16, converts
; them to f16 again, packs them pairwise into two dwords and stores those at
; offsets 4 and 0 from the pointer kept in v[40:41].
338 define void @outgoing_v4f16_return(ptr %ptr) #0 {
339 ; GFX7-LABEL: outgoing_v4f16_return:
341 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342 ; GFX7-NEXT: s_mov_b32 s16, s33
343 ; GFX7-NEXT: s_mov_b32 s33, s32
344 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
345 ; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
346 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
347 ; GFX7-NEXT: v_writelane_b32 v42, s16, 2
348 ; GFX7-NEXT: v_writelane_b32 v42, s30, 0
349 ; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi
350 ; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo
351 ; GFX7-NEXT: s_addk_i32 s32, 0x400
352 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
353 ; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
354 ; GFX7-NEXT: v_writelane_b32 v42, s31, 1
355 ; GFX7-NEXT: v_mov_b32_e32 v41, v1
356 ; GFX7-NEXT: v_mov_b32_e32 v40, v0
357 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
358 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
359 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
360 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
361 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
362 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
363 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
364 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
365 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
366 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
367 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
368 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
369 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
370 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
371 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
372 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
373 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
374 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
375 ; GFX7-NEXT: v_or_b32_e32 v4, v0, v1
376 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
377 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
378 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40
379 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
380 ; GFX7-NEXT: flat_store_dword v[0:1], v2
381 ; GFX7-NEXT: flat_store_dword v[40:41], v4
382 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
383 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
384 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1
385 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0
386 ; GFX7-NEXT: s_mov_b32 s32, s33
387 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2
388 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
389 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
390 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
391 ; GFX7-NEXT: s_mov_b32 s33, s4
392 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
393 ; GFX7-NEXT: s_setpc_b64 s[30:31]
394 %val = call <4 x half> @v4f16_result()
395 store <4 x half> %val, ptr %ptr
; <8 x half> returned from a call: the result of @v8f16_result is stored
; through %ptr.
; After the call the expected output round-trips v0..v7 through f16, converts
; them to f16 again, packs them pairwise into four dwords and stores those at
; offsets 12, 8, 4 and 0 from the pointer kept in v[40:41].
399 define void @outgoing_v8f16_return(ptr %ptr) #0 {
400 ; GFX7-LABEL: outgoing_v8f16_return:
402 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
403 ; GFX7-NEXT: s_mov_b32 s16, s33
404 ; GFX7-NEXT: s_mov_b32 s33, s32
405 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
406 ; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
407 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
408 ; GFX7-NEXT: v_writelane_b32 v42, s16, 2
409 ; GFX7-NEXT: v_writelane_b32 v42, s30, 0
410 ; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
411 ; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
412 ; GFX7-NEXT: s_addk_i32 s32, 0x400
413 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
414 ; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
415 ; GFX7-NEXT: v_writelane_b32 v42, s31, 1
416 ; GFX7-NEXT: v_mov_b32_e32 v41, v1
417 ; GFX7-NEXT: v_mov_b32_e32 v40, v0
418 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
419 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
420 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
421 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
422 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
423 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
424 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
425 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
426 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
427 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
428 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
429 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
430 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
431 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
432 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
433 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
434 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
435 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
436 ; GFX7-NEXT: v_or_b32_e32 v8, v0, v1
437 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5
438 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
439 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
440 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4
441 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v7
442 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6
443 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
444 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
445 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
446 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
447 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
448 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
449 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
450 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
451 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
452 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
453 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
454 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
455 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
456 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
457 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
458 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v0
459 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v40
460 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
461 ; GFX7-NEXT: flat_store_dword v[0:1], v3
462 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v40
463 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
464 ; GFX7-NEXT: flat_store_dword v[0:1], v5
465 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40
466 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc
467 ; GFX7-NEXT: flat_store_dword v[0:1], v2
468 ; GFX7-NEXT: flat_store_dword v[40:41], v8
469 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
470 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
471 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1
472 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0
473 ; GFX7-NEXT: s_mov_b32 s32, s33
474 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2
475 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
476 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
477 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
478 ; GFX7-NEXT: s_mov_b32 s33, s4
479 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
480 ; GFX7-NEXT: s_setpc_b64 s[30:31]
481 %val = call <8 x half> @v8f16_result()
482 store <8 x half> %val, ptr %ptr
; Calls @v8f16_result and extracts element 0 of the returned <8 x half>.
; After the call the expected output converts all of v0..v7 to f16, masks each
; to 16 bits and converts each back to f32; v0 goes through one extra
; f16/f32 round trip first.
; NOTE(review): the function name suggests %extract is used in a different
; basic block than the call - confirm against the full function body.
486 define half @call_split_type_used_outside_block_v8f16() #0 {
487 ; GFX7-LABEL: call_split_type_used_outside_block_v8f16:
488 ; GFX7: ; %bb.0: ; %bb0
489 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
490 ; GFX7-NEXT: s_mov_b32 s16, s33
491 ; GFX7-NEXT: s_mov_b32 s33, s32
492 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1
493 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
494 ; GFX7-NEXT: s_mov_b64 exec, s[18:19]
495 ; GFX7-NEXT: v_writelane_b32 v40, s16, 2
496 ; GFX7-NEXT: v_writelane_b32 v40, s30, 0
497 ; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi
498 ; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo
499 ; GFX7-NEXT: s_addk_i32 s32, 0x400
500 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1
501 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
502 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
503 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
504 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
505 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
506 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
507 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
508 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
509 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
510 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
511 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
512 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
513 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
514 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
515 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
516 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
517 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
518 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
519 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
520 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
521 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
522 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
523 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
524 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
525 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
526 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
527 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
528 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
529 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1
530 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0
531 ; GFX7-NEXT: s_mov_b32 s32, s33
532 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2
533 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
534 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
535 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
536 ; GFX7-NEXT: s_mov_b32 s33, s4
537 ; GFX7-NEXT: s_waitcnt vmcnt(0)
538 ; GFX7-NEXT: s_setpc_b64 s[30:31]
540 %split.ret.type = call <8 x half> @v8f16_result()
544 %extract = extractelement <8 x half> %split.ret.type, i32 0
; Declarations of the constrained fpext (half -> float) and fptrunc
; (float -> half) intrinsics for scalar and 2-, 3- and 4-element vectors.
; fpext takes one metadata operand and fptrunc takes two; the call sites in
; this file pass fpexcept.strict, with round.tonearest first for fptrunc.
; Attribute group #0 is strictfp.
548 declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
549 declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
550 declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
551 declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
553 declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
554 declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
555 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
556 declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
558 attributes #0 = { strictfp }