1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; uitofp of an i32 pre-masked to one byte. The and+uitofp should fold to a
; single v_cvt_f32_ubyte0 (an SDWA byte-select form on VI, per the checks).
8 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
9 ; SI-LABEL: v_uitofp_i32_to_f32_mask255:
11 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
13 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
14 ; SI-NEXT: s_setpc_b64 s[30:31]
16 ; VI-LABEL: v_uitofp_i32_to_f32_mask255:
18 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
20 ; VI-NEXT: s_setpc_b64 s[30:31]
21 %masked = and i32 %arg0, 255
22 %cvt = uitofp i32 %masked to float
; Signed variant of the masked-byte conversion. Because and-with-255 makes the
; value provably non-negative, sitofp may use the same unsigned ubyte0 convert.
26 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
27 ; SI-LABEL: v_sitofp_i32_to_f32_mask255:
29 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
31 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
32 ; SI-NEXT: s_setpc_b64 s[30:31]
34 ; VI-LABEL: v_sitofp_i32_to_f32_mask255:
36 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
38 ; VI-NEXT: s_setpc_b64 s[30:31]
39 %masked = and i32 %arg0, 255
40 %cvt = sitofp i32 %masked to float
; Shift amount 7 is not byte-aligned, so no SDWA byte select is possible;
; both targets extract the field with v_bfe_u32 before the ubyte0 convert.
44 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
45 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
47 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GCN-NEXT: v_bfe_u32 v0, v0, 7, 8
49 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
50 ; GCN-NEXT: s_setpc_b64 s[30:31]
51 %lshr.7 = lshr i32 %arg0, 7
52 %masked = and i32 %lshr.7, 255
53 %cvt = uitofp i32 %masked to float
; Byte-aligned shift of 8: VI selects the second byte directly via SDWA
; (src0_sel BYTE_1); SI has no SDWA and uses a bitfield extract instead.
57 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
58 ; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
60 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
62 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
63 ; SI-NEXT: s_setpc_b64 s[30:31]
65 ; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
67 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
69 ; VI-NEXT: s_setpc_b64 s[30:31]
70 %lshr.8 = lshr i32 %arg0, 8
71 %masked = and i32 %lshr.8, 255
72 %cvt = uitofp i32 %masked to float
; The shifted value has a second use (it is stored), so the lshr must be kept
; as a real instruction; only the conversion itself may fold away the mask.
76 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
77 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
79 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
81 ; SI-NEXT: s_mov_b32 s6, -1
82 ; SI-NEXT: s_mov_b32 s7, 0xf000
83 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
84 ; SI-NEXT: s_waitcnt expcnt(0)
85 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
86 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
87 ; SI-NEXT: s_waitcnt vmcnt(0)
88 ; SI-NEXT: s_setpc_b64 s[30:31]
90 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
92 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
94 ; VI-NEXT: flat_store_dword v[0:1], v0
95 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
96 ; VI-NEXT: s_waitcnt vmcnt(0)
97 ; VI-NEXT: s_setpc_b64 s[30:31]
98 %lshr.8 = lshr i32 %arg0, 8
99 store i32 %lshr.8, i32 addrspace(1)* undef
100 %masked = and i32 %lshr.8, 255
101 %cvt = uitofp i32 %masked to float
; Byte-aligned shift of 16: VI selects the third byte via SDWA (BYTE_2);
; SI again falls back to a bitfield extract plus ubyte0 convert.
105 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
106 ; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
108 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
110 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
111 ; SI-NEXT: s_setpc_b64 s[30:31]
113 ; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
115 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
117 ; VI-NEXT: s_setpc_b64 s[30:31]
118 %lshr.16 = lshr i32 %arg0, 16
119 %masked = and i32 %lshr.16, 255
120 %cvt = uitofp i32 %masked to float
; Shift of 24 leaves only the top byte, so the mask is redundant and both
; targets use the dedicated v_cvt_f32_ubyte3 instruction directly.
124 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
125 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
127 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
129 ; GCN-NEXT: s_setpc_b64 s[30:31]
130 %lshr.16 = lshr i32 %arg0, 24
131 %masked = and i32 %lshr.16, 255
132 %cvt = uitofp i32 %masked to float
; Direct i8 argument to f32. The in-register i8 is zero-extended with an
; explicit mask on SI; VI folds the byte select into the SDWA convert.
136 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
137 ; SI-LABEL: v_uitofp_i8_to_f32:
139 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
141 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
142 ; SI-NEXT: s_setpc_b64 s[30:31]
144 ; VI-LABEL: v_uitofp_i8_to_f32:
146 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
148 ; VI-NEXT: s_setpc_b64 s[30:31]
149 %cvt = uitofp i8 %arg0 to float
; Two packed bytes arriving as an i16: each lane should become a per-byte
; convert (and/bfe + ubyte0 on SI, two SDWA byte selects on VI).
153 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
154 ; SI-LABEL: v_uitofp_v2i8_to_v2f32:
156 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
158 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
159 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
160 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
161 ; SI-NEXT: v_mov_b32_e32 v0, v2
162 ; SI-NEXT: s_setpc_b64 s[30:31]
164 ; VI-LABEL: v_uitofp_v2i8_to_v2f32:
166 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
168 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
169 ; VI-NEXT: v_mov_b32_e32 v0, v2
170 ; VI-NEXT: s_setpc_b64 s[30:31]
171 %val = bitcast i16 %arg0 to <2 x i8>
172 %cvt = uitofp <2 x i8> %val to <2 x float>
; Three packed bytes reached by trunc-to-i24 then bitcast; each result lane
; is still a single per-byte convert of the original i32 source register.
176 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
177 ; SI-LABEL: v_uitofp_v3i8_to_v3f32:
179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
181 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
182 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
183 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
184 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
185 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
186 ; SI-NEXT: v_mov_b32_e32 v0, v3
187 ; SI-NEXT: s_setpc_b64 s[30:31]
189 ; VI-LABEL: v_uitofp_v3i8_to_v3f32:
191 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
193 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
194 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
195 ; VI-NEXT: v_mov_b32_e32 v0, v3
196 ; VI-NEXT: s_setpc_b64 s[30:31]
197 %trunc = trunc i32 %arg0 to i24
198 %val = bitcast i24 %trunc to <3 x i8>
199 %cvt = uitofp <3 x i8> %val to <3 x float>
; Full i32 reinterpreted as four bytes; lanes 0-2 use byte extracts/SDWA and
; the top byte uses v_cvt_f32_ubyte3 on both targets.
203 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
204 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
206 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
208 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
209 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
210 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
211 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
212 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
213 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
214 ; SI-NEXT: v_mov_b32_e32 v0, v4
215 ; SI-NEXT: s_setpc_b64 s[30:31]
217 ; VI-LABEL: v_uitofp_v4i8_to_v4f32:
219 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
221 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
222 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
223 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
224 ; VI-NEXT: v_mov_b32_e32 v0, v4
225 ; VI-NEXT: s_setpc_b64 s[30:31]
226 %val = bitcast i32 %arg0 to <4 x i8>
227 %cvt = uitofp <4 x i8> %val to <4 x float>
; Same unpack expressed as scalar shift/mask/convert chains plus
; insertelements; expected to produce exactly the same code as the
; bitcast-to-<4 x i8> form above.
231 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
232 ; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
234 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
236 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
237 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
238 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
239 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
240 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
241 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
242 ; SI-NEXT: v_mov_b32_e32 v0, v4
243 ; SI-NEXT: s_setpc_b64 s[30:31]
245 ; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
247 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
249 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
250 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
251 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
252 ; VI-NEXT: v_mov_b32_e32 v0, v4
253 ; VI-NEXT: s_setpc_b64 s[30:31]
254 %mask.arg0 = and i32 %arg0, 255
255 %cvt0 = uitofp i32 %mask.arg0 to float
257 %lshr.8 = lshr i32 %arg0, 8
258 %mask.lshr.8 = and i32 %lshr.8, 255
259 %cvt1 = uitofp i32 %mask.lshr.8 to float
261 %lshr.16 = lshr i32 %arg0, 16
262 %mask.lshr.16 = and i32 %lshr.16, 255
263 %cvt2 = uitofp i32 %mask.lshr.16 to float
265 %lshr.24 = lshr i32 %arg0, 24
266 %mask.lshr.24 = and i32 %lshr.24, 255
267 %cvt3 = uitofp i32 %mask.lshr.24 to float
269 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
270 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
271 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
272 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
273 ret <4 x float> %ins.3
; f16 destination: there is no ubyte-to-f16 instruction, so the byte convert
; goes to f32 first and is then rounded with v_cvt_f16_f32.
276 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
277 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
279 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
281 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
282 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
283 ; SI-NEXT: s_setpc_b64 s[30:31]
285 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
287 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
289 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
290 ; VI-NEXT: s_setpc_b64 s[30:31]
291 %masked = and i32 %arg0, 255
292 %cvt = uitofp i32 %masked to half
; Signed f16 variant; the mask guarantees a non-negative value, so the same
; unsigned byte convert plus f32-to-f16 round is expected.
296 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
297 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
299 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
301 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
302 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
303 ; SI-NEXT: s_setpc_b64 s[30:31]
305 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
307 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
309 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
310 ; VI-NEXT: s_setpc_b64 s[30:31]
311 %masked = and i32 %arg0, 255
312 %cvt = sitofp i32 %masked to half
; Second byte to f16: bfe + ubyte0 on SI, SDWA BYTE_1 on VI, then round.
316 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
317 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
319 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
321 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
322 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
323 ; SI-NEXT: s_setpc_b64 s[30:31]
325 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
327 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
329 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
330 ; VI-NEXT: s_setpc_b64 s[30:31]
331 %lshr.8 = lshr i32 %arg0, 8
332 %masked = and i32 %lshr.8, 255
333 %cvt = uitofp i32 %masked to half
; Third byte to f16: bfe + ubyte0 on SI, SDWA BYTE_2 on VI, then round.
337 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
338 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
342 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
343 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
344 ; SI-NEXT: s_setpc_b64 s[30:31]
346 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
348 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
350 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
351 ; VI-NEXT: s_setpc_b64 s[30:31]
352 %lshr.16 = lshr i32 %arg0, 16
353 %masked = and i32 %lshr.16, 255
354 %cvt = uitofp i32 %masked to half
; Top byte to f16: mask folds away entirely into v_cvt_f32_ubyte3 on both
; targets, followed by the f32-to-f16 round.
358 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
359 ; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
361 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
363 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
364 ; GCN-NEXT: s_setpc_b64 s[30:31]
365 %lshr.16 = lshr i32 %arg0, 24
366 %masked = and i32 %lshr.16, 255
367 %cvt = uitofp i32 %masked to half
; Direct i8 argument to f16 via the intermediate f32 byte convert.
371 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
372 ; SI-LABEL: v_uitofp_i8_to_f16:
374 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
376 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
377 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
378 ; SI-NEXT: s_setpc_b64 s[30:31]
380 ; VI-LABEL: v_uitofp_i8_to_f16:
382 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
384 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
385 ; VI-NEXT: s_setpc_b64 s[30:31]
386 %cvt = uitofp i8 %arg0 to half
; f64 destination: there is no ubyte convert to f64, so the checks show an
; explicit mask followed by v_cvt_f64_u32 on both targets.
390 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
391 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
393 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
395 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
396 ; GCN-NEXT: s_setpc_b64 s[30:31]
397 %masked = and i32 %arg0, 255
398 %cvt = uitofp i32 %masked to double
; Second byte to f64: bitfield extract feeds the unsigned i32-to-f64 convert.
402 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
403 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
405 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
407 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
408 ; GCN-NEXT: s_setpc_b64 s[30:31]
409 %lshr.8 = lshr i32 %arg0, 8
410 %masked = and i32 %lshr.8, 255
411 %cvt = uitofp i32 %masked to double
; Third byte to f64: same bfe + v_cvt_f64_u32 pattern at offset 16.
415 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
416 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
418 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
420 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
421 ; GCN-NEXT: s_setpc_b64 s[30:31]
422 %lshr.16 = lshr i32 %arg0, 16
423 %masked = and i32 %lshr.16, 255
424 %cvt = uitofp i32 %masked to double
; Top byte to f64: a plain shift suffices (no mask needed) before the convert.
428 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
429 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
431 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
433 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
434 ; GCN-NEXT: s_setpc_b64 s[30:31]
435 %lshr.16 = lshr i32 %arg0, 24
436 %masked = and i32 %lshr.16, 255
437 %cvt = uitofp i32 %masked to double
; Direct i8 argument to f64: mask to zero-extend, then v_cvt_f64_u32.
441 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
442 ; GCN-LABEL: v_uitofp_i8_to_f64:
444 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
446 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
447 ; GCN-NEXT: s_setpc_b64 s[30:31]
448 %cvt = uitofp i8 %arg0 to double
; Kernel case: a byte loaded from memory is already zero-extended by the
; ubyte load, so the conversion needs no separate mask on either target.
452 define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
453 ; SI-LABEL: load_i8_to_f32:
455 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
456 ; SI-NEXT: s_mov_b32 s6, 0
457 ; SI-NEXT: s_mov_b32 s7, 0xf000
458 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
460 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
461 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
462 ; SI-NEXT: s_mov_b32 s6, -1
463 ; SI-NEXT: s_waitcnt vmcnt(0)
464 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
465 ; SI-NEXT: s_waitcnt lgkmcnt(0)
466 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
469 ; VI-LABEL: load_i8_to_f32:
471 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
472 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
473 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
474 ; VI-NEXT: s_waitcnt lgkmcnt(0)
475 ; VI-NEXT: v_mov_b32_e32 v1, s2
476 ; VI-NEXT: v_mov_b32_e32 v2, s3
477 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
478 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
479 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
480 ; VI-NEXT: s_waitcnt vmcnt(0)
481 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
482 ; VI-NEXT: v_mov_b32_e32 v0, s0
483 ; VI-NEXT: v_mov_b32_e32 v1, s1
484 ; VI-NEXT: flat_store_dword v[0:1], v2
486 %tid = call i32 @llvm.amdgcn.workitem.id.x()
487 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
488 %load = load i8, i8 addrspace(1)* %gep, align 1
489 %cvt = uitofp i8 %load to float
490 store float %cvt, float addrspace(1)* %out, align 4
495 ; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
496 ; %tid = call i32 @llvm.amdgcn.workitem.id.x()
497 ; %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
498 ; %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
499 ; %cvt = uitofp <2 x i8> %load to <2 x float>
500 ; store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
505 ; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
506 ; %tid = call i32 @llvm.amdgcn.workitem.id.x()
507 ; %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
508 ; %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
509 ; %cvt = uitofp <3 x i8> %load to <3 x float>
510 ; store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
514 ; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
515 ; %tid = call i32 @llvm.amdgcn.workitem.id.x()
516 ; %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
517 ; %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
518 ; %cvt = uitofp <4 x i8> %load to <4 x float>
519 ; store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
523 ; This should not be adding instructions to shift into the correct
524 ; position in the word for the component.
526 ; FIXME: Packing bytes
; Unaligned <4 x i8> load: the bytes are fetched individually, repacked into
; one dword with shifts/ors, and only then unpacked again for the per-byte
; converts. The FIXME above notes the repacking is wasted work — ideally each
; loaded byte would feed a convert directly.
527 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
528 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
530 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
531 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
532 ; SI-NEXT: v_mov_b32_e32 v1, 0
533 ; SI-NEXT: s_mov_b32 s6, 0
534 ; SI-NEXT: s_mov_b32 s7, 0xf000
535 ; SI-NEXT: s_waitcnt lgkmcnt(0)
536 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
537 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
538 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
539 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
540 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
541 ; SI-NEXT: s_mov_b32 s6, -1
542 ; SI-NEXT: s_waitcnt vmcnt(3)
543 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
544 ; SI-NEXT: s_waitcnt vmcnt(2)
545 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
546 ; SI-NEXT: s_waitcnt vmcnt(1)
547 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
548 ; SI-NEXT: s_waitcnt vmcnt(0)
549 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
550 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
551 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
552 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
553 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
554 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
555 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
556 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
557 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
558 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
559 ; SI-NEXT: s_waitcnt lgkmcnt(0)
560 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
563 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
565 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
566 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
567 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
568 ; VI-NEXT: s_waitcnt lgkmcnt(0)
569 ; VI-NEXT: v_mov_b32_e32 v0, s2
570 ; VI-NEXT: v_mov_b32_e32 v1, s3
571 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
572 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
573 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
574 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
575 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
576 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
577 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
578 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
579 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
580 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
581 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
582 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
583 ; VI-NEXT: s_waitcnt vmcnt(3)
584 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
585 ; VI-NEXT: s_waitcnt vmcnt(2)
586 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
587 ; VI-NEXT: s_waitcnt vmcnt(1)
588 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
589 ; VI-NEXT: s_waitcnt vmcnt(0)
590 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
591 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
592 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
593 ; VI-NEXT: v_mov_b32_e32 v5, s1
594 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
595 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
596 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
597 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
598 ; VI-NEXT: v_mov_b32_e32 v4, s0
599 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
601 %tid = call i32 @llvm.amdgcn.workitem.id.x()
602 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
603 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
604 %cvt = uitofp <4 x i8> %load to <4 x float>
605 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
609 ; FIXME: Need to handle non-uniform case for function below (load without gep).
610 ; Instructions still emitted to repack bytes for add use.
611 ; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
612 ; %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
613 ; %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
614 ; %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
615 ; %cvt = uitofp <4 x i8> %load to <4 x float>
616 ; store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
617 ; %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
618 ; store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
622 ; Make sure this doesn't crash.
624 ; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
625 ; %tid = call i32 @llvm.amdgcn.workitem.id.x()
626 ; %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
627 ; %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
628 ; %cvt = uitofp <7 x i8> %load to <7 x float>
629 ; store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
634 ; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
635 ; %tid = call i32 @llvm.amdgcn.workitem.id.x()
636 ; %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
637 ; %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
638 ; %cvt = uitofp <8 x i8> %load to <8 x float>
639 ; store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
; In-register zero-extend after an add: the add must survive, and on VI the
; and-with-255 folds into the SDWA byte-0 operand of the convert.
643 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
644 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
646 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
647 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
648 ; SI-NEXT: v_mov_b32_e32 v1, 0
649 ; SI-NEXT: s_mov_b32 s6, 0
650 ; SI-NEXT: s_mov_b32 s7, 0xf000
651 ; SI-NEXT: s_waitcnt lgkmcnt(0)
652 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
653 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
654 ; SI-NEXT: s_mov_b32 s6, -1
655 ; SI-NEXT: s_waitcnt vmcnt(0)
656 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
657 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
658 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
659 ; SI-NEXT: s_waitcnt lgkmcnt(0)
660 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
663 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
665 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
666 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
667 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
668 ; VI-NEXT: s_waitcnt lgkmcnt(0)
669 ; VI-NEXT: v_mov_b32_e32 v0, s2
670 ; VI-NEXT: v_mov_b32_e32 v1, s3
671 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
672 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
673 ; VI-NEXT: flat_load_dword v0, v[0:1]
674 ; VI-NEXT: s_waitcnt vmcnt(0)
675 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
676 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
677 ; VI-NEXT: v_mov_b32_e32 v0, s0
678 ; VI-NEXT: v_mov_b32_e32 v1, s1
679 ; VI-NEXT: flat_store_dword v[0:1], v2
681 %tid = call i32 @llvm.amdgcn.workitem.id.x()
682 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
683 %load = load i32, i32 addrspace(1)* %gep, align 4
684 %add = add i32 %load, 2
685 %inreg = and i32 %add, 255
686 %cvt = uitofp i32 %inreg to float
687 store float %cvt, float addrspace(1)* %out, align 4
; and-then-shift form of extracting the second byte (mask 65280, shift 8):
; recognized as byte 1, giving bfe on SI and an SDWA BYTE_1 convert on VI.
691 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
692 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
694 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
695 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
696 ; SI-NEXT: v_mov_b32_e32 v1, 0
697 ; SI-NEXT: s_mov_b32 s6, 0
698 ; SI-NEXT: s_mov_b32 s7, 0xf000
699 ; SI-NEXT: s_waitcnt lgkmcnt(0)
700 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
701 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
702 ; SI-NEXT: s_mov_b32 s6, -1
703 ; SI-NEXT: s_waitcnt vmcnt(0)
704 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
705 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
706 ; SI-NEXT: s_waitcnt lgkmcnt(0)
707 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
710 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
712 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
713 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
714 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
715 ; VI-NEXT: s_waitcnt lgkmcnt(0)
716 ; VI-NEXT: v_mov_b32_e32 v0, s2
717 ; VI-NEXT: v_mov_b32_e32 v1, s3
718 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
719 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
720 ; VI-NEXT: flat_load_dword v0, v[0:1]
721 ; VI-NEXT: s_waitcnt vmcnt(0)
722 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
723 ; VI-NEXT: v_mov_b32_e32 v0, s0
724 ; VI-NEXT: v_mov_b32_e32 v1, s1
725 ; VI-NEXT: flat_store_dword v[0:1], v2
727 %tid = call i32 @llvm.amdgcn.workitem.id.x()
728 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
729 %load = load i32, i32 addrspace(1)* %gep, align 4
730 %inreg = and i32 %load, 65280
731 %shr = lshr i32 %inreg, 8
732 %cvt = uitofp i32 %shr to float
733 store float %cvt, float addrspace(1)* %out, align 4
737 ; We don't get these ones because of the zext, but instcombine removes
738 ; them so it shouldn't really matter.
; Explicit zext of a loaded i8: the ubyte load already zero-extends, so the
; convert needs no mask at all (see the file comment above about instcombine).
739 define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
740 ; SI-LABEL: i8_zext_i32_to_f32:
742 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
743 ; SI-NEXT: s_mov_b32 s6, 0
744 ; SI-NEXT: s_mov_b32 s7, 0xf000
745 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
746 ; SI-NEXT: s_waitcnt lgkmcnt(0)
747 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
748 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
749 ; SI-NEXT: s_mov_b32 s6, -1
750 ; SI-NEXT: s_waitcnt vmcnt(0)
751 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
752 ; SI-NEXT: s_waitcnt lgkmcnt(0)
753 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
756 ; VI-LABEL: i8_zext_i32_to_f32:
758 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
759 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
760 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
761 ; VI-NEXT: s_waitcnt lgkmcnt(0)
762 ; VI-NEXT: v_mov_b32_e32 v1, s2
763 ; VI-NEXT: v_mov_b32_e32 v2, s3
764 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
765 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
766 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
767 ; VI-NEXT: s_waitcnt vmcnt(0)
768 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
769 ; VI-NEXT: v_mov_b32_e32 v0, s0
770 ; VI-NEXT: v_mov_b32_e32 v1, s1
771 ; VI-NEXT: flat_store_dword v[0:1], v2
773 %tid = call i32 @llvm.amdgcn.workitem.id.x()
774 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
775 %load = load i8, i8 addrspace(1)* %gep, align 1
776 %ext = zext i8 %load to i32
777 %cvt = uitofp i32 %ext to float
778 store float %cvt, float addrspace(1)* %out, align 4
; Vector zext before uitofp: should lower identically to the unaligned
; <4 x i8> load test above — byte loads, dword repack, then per-byte converts.
782 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
783 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
785 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
786 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
787 ; SI-NEXT: v_mov_b32_e32 v1, 0
788 ; SI-NEXT: s_mov_b32 s6, 0
789 ; SI-NEXT: s_mov_b32 s7, 0xf000
790 ; SI-NEXT: s_waitcnt lgkmcnt(0)
791 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
792 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
793 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
794 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
795 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
796 ; SI-NEXT: s_mov_b32 s6, -1
797 ; SI-NEXT: s_waitcnt vmcnt(3)
798 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
799 ; SI-NEXT: s_waitcnt vmcnt(2)
800 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
801 ; SI-NEXT: s_waitcnt vmcnt(1)
802 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
803 ; SI-NEXT: s_waitcnt vmcnt(0)
804 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
805 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
806 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
807 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
808 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
809 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
810 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
811 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
812 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
813 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
814 ; SI-NEXT: s_waitcnt lgkmcnt(0)
815 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
818 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
820 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
821 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
822 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
823 ; VI-NEXT: s_waitcnt lgkmcnt(0)
824 ; VI-NEXT: v_mov_b32_e32 v0, s2
825 ; VI-NEXT: v_mov_b32_e32 v1, s3
826 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
827 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
828 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
829 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
830 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
831 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
832 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
833 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
834 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
835 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
836 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
837 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
838 ; VI-NEXT: s_waitcnt vmcnt(3)
839 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
840 ; VI-NEXT: s_waitcnt vmcnt(2)
841 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
842 ; VI-NEXT: s_waitcnt vmcnt(1)
843 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
844 ; VI-NEXT: s_waitcnt vmcnt(0)
845 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
846 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
847 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
848 ; VI-NEXT: v_mov_b32_e32 v5, s1
849 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
850 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
851 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
852 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
853 ; VI-NEXT: v_mov_b32_e32 v4, s0
854 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
856 %tid = call i32 @llvm.amdgcn.workitem.id.x()
857 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
858 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
859 %ext = zext <4 x i8> %load to <4 x i32>
860 %cvt = uitofp <4 x i32> %ext to <4 x float>
861 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
; uitofp of (and i32 x, 255) on a loaded value: the CHECK lines show the load
; itself is narrowed to a byte load (buffer_load_ubyte / flat_load_ubyte) and
; the conversion selects the dedicated v_cvt_f32_ubyte0 instruction on both
; SI and VI, with no separate mask instruction.
865 define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
866 ; SI-LABEL: extract_byte0_to_f32:
868 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
869 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
870 ; SI-NEXT: v_mov_b32_e32 v1, 0
871 ; SI-NEXT: s_mov_b32 s6, 0
872 ; SI-NEXT: s_mov_b32 s7, 0xf000
873 ; SI-NEXT: s_waitcnt lgkmcnt(0)
874 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
875 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
876 ; SI-NEXT: s_mov_b32 s6, -1
877 ; SI-NEXT: s_waitcnt vmcnt(0)
878 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
879 ; SI-NEXT: s_waitcnt lgkmcnt(0)
880 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
883 ; VI-LABEL: extract_byte0_to_f32:
885 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
886 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
887 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
888 ; VI-NEXT: s_waitcnt lgkmcnt(0)
889 ; VI-NEXT: v_mov_b32_e32 v0, s2
890 ; VI-NEXT: v_mov_b32_e32 v1, s3
891 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
892 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
893 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
894 ; VI-NEXT: s_waitcnt vmcnt(0)
895 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
896 ; VI-NEXT: v_mov_b32_e32 v0, s0
897 ; VI-NEXT: v_mov_b32_e32 v1, s1
898 ; VI-NEXT: flat_store_dword v[0:1], v2
900 %tid = call i32 @llvm.amdgcn.workitem.id.x()
901 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
902 %val = load i32, i32 addrspace(1)* %gep
903 %and = and i32 %val, 255
904 %cvt = uitofp i32 %and to float
905 store float %cvt, float addrspace(1)* %out
; uitofp of byte 1 ((x >> 8) & 255) of a loaded i32: SI extracts the field
; with v_bfe_u32 8,8 then converts with v_cvt_f32_ubyte0; VI folds the whole
; extract+convert into a single v_cvt_f32_ubyte0_sdwa with src0_sel:BYTE_1.
909 define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
910 ; SI-LABEL: extract_byte1_to_f32:
912 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
913 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
914 ; SI-NEXT: v_mov_b32_e32 v1, 0
915 ; SI-NEXT: s_mov_b32 s6, 0
916 ; SI-NEXT: s_mov_b32 s7, 0xf000
917 ; SI-NEXT: s_waitcnt lgkmcnt(0)
918 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
919 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
920 ; SI-NEXT: s_mov_b32 s6, -1
921 ; SI-NEXT: s_waitcnt vmcnt(0)
922 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
923 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
924 ; SI-NEXT: s_waitcnt lgkmcnt(0)
925 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
928 ; VI-LABEL: extract_byte1_to_f32:
930 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
931 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
932 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
933 ; VI-NEXT: s_waitcnt lgkmcnt(0)
934 ; VI-NEXT: v_mov_b32_e32 v0, s2
935 ; VI-NEXT: v_mov_b32_e32 v1, s3
936 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
937 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
938 ; VI-NEXT: flat_load_dword v0, v[0:1]
939 ; VI-NEXT: s_waitcnt vmcnt(0)
940 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
941 ; VI-NEXT: v_mov_b32_e32 v0, s0
942 ; VI-NEXT: v_mov_b32_e32 v1, s1
943 ; VI-NEXT: flat_store_dword v[0:1], v2
945 %tid = call i32 @llvm.amdgcn.workitem.id.x()
946 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
947 %val = load i32, i32 addrspace(1)* %gep
948 %srl = lshr i32 %val, 8
949 %and = and i32 %srl, 255
950 %cvt = uitofp i32 %and to float
951 store float %cvt, float addrspace(1)* %out
; uitofp of byte 2 ((x >> 16) & 255) of a loaded i32: SI uses v_bfe_u32 16,8
; followed by v_cvt_f32_ubyte0; VI selects a single v_cvt_f32_ubyte0_sdwa
; with src0_sel:BYTE_2.
955 define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
956 ; SI-LABEL: extract_byte2_to_f32:
958 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
959 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
960 ; SI-NEXT: v_mov_b32_e32 v1, 0
961 ; SI-NEXT: s_mov_b32 s6, 0
962 ; SI-NEXT: s_mov_b32 s7, 0xf000
963 ; SI-NEXT: s_waitcnt lgkmcnt(0)
964 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
965 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
966 ; SI-NEXT: s_mov_b32 s6, -1
967 ; SI-NEXT: s_waitcnt vmcnt(0)
968 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
969 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
970 ; SI-NEXT: s_waitcnt lgkmcnt(0)
971 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
974 ; VI-LABEL: extract_byte2_to_f32:
976 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
977 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
978 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
979 ; VI-NEXT: s_waitcnt lgkmcnt(0)
980 ; VI-NEXT: v_mov_b32_e32 v0, s2
981 ; VI-NEXT: v_mov_b32_e32 v1, s3
982 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
983 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
984 ; VI-NEXT: flat_load_dword v0, v[0:1]
985 ; VI-NEXT: s_waitcnt vmcnt(0)
986 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
987 ; VI-NEXT: v_mov_b32_e32 v0, s0
988 ; VI-NEXT: v_mov_b32_e32 v1, s1
989 ; VI-NEXT: flat_store_dword v[0:1], v2
991 %tid = call i32 @llvm.amdgcn.workitem.id.x()
992 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
993 %val = load i32, i32 addrspace(1)* %gep
994 %srl = lshr i32 %val, 16
995 %and = and i32 %srl, 255
996 %cvt = uitofp i32 %and to float
997 store float %cvt, float addrspace(1)* %out
; uitofp of the top byte ((x >> 24) & 255): the shift-by-24 makes the mask
; redundant, so both SI and VI select the single v_cvt_f32_ubyte3 instruction
; with no separate shift or mask.
1001 define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1002 ; SI-LABEL: extract_byte3_to_f32:
1004 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1005 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1006 ; SI-NEXT: v_mov_b32_e32 v1, 0
1007 ; SI-NEXT: s_mov_b32 s6, 0
1008 ; SI-NEXT: s_mov_b32 s7, 0xf000
1009 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1010 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1011 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1012 ; SI-NEXT: s_mov_b32 s6, -1
1013 ; SI-NEXT: s_waitcnt vmcnt(0)
1014 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1015 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1016 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1019 ; VI-LABEL: extract_byte3_to_f32:
1021 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1022 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1023 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1024 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1025 ; VI-NEXT: v_mov_b32_e32 v0, s2
1026 ; VI-NEXT: v_mov_b32_e32 v1, s3
1027 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1028 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1029 ; VI-NEXT: flat_load_dword v0, v[0:1]
1030 ; VI-NEXT: s_waitcnt vmcnt(0)
1031 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0
1032 ; VI-NEXT: v_mov_b32_e32 v0, s0
1033 ; VI-NEXT: v_mov_b32_e32 v1, s1
1034 ; VI-NEXT: flat_store_dword v[0:1], v2
1036 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1037 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1038 %val = load i32, i32 addrspace(1)* %gep
1039 %srl = lshr i32 %val, 24
1040 %and = and i32 %srl, 255
1041 %cvt = uitofp i32 %and to float
1042 store float %cvt, float addrspace(1)* %out
; Multi-use case: the result of the `or` feeds both the byte-0 conversion and
; a bitcast used by the fadd, so the or must stay live. The byte conversion
; still selects v_cvt_f32_ubyte0 (with an explicit 0xff mask on SI, sdwa
; BYTE_0 on VI) instead of falling back to a generic u32 conversion.
1046 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
1047 ; SI-LABEL: cvt_ubyte0_or_multiuse:
1048 ; SI: ; %bb.0: ; %bb
1049 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1050 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1051 ; SI-NEXT: v_mov_b32_e32 v1, 0
1052 ; SI-NEXT: s_mov_b32 s6, 0
1053 ; SI-NEXT: s_mov_b32 s7, 0xf000
1054 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1055 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
1056 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1057 ; SI-NEXT: s_mov_b32 s6, -1
1058 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1059 ; SI-NEXT: s_waitcnt vmcnt(0)
1060 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1061 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
1062 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1063 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
1064 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1067 ; VI-LABEL: cvt_ubyte0_or_multiuse:
1068 ; VI: ; %bb.0: ; %bb
1069 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1070 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1071 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1072 ; VI-NEXT: v_mov_b32_e32 v0, s0
1073 ; VI-NEXT: v_mov_b32_e32 v1, s1
1074 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1075 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1076 ; VI-NEXT: flat_load_dword v0, v[0:1]
1077 ; VI-NEXT: s_waitcnt vmcnt(0)
1078 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1079 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1080 ; VI-NEXT: v_add_f32_e32 v2, v0, v1
1081 ; VI-NEXT: v_mov_b32_e32 v0, s2
1082 ; VI-NEXT: v_mov_b32_e32 v1, s3
1083 ; VI-NEXT: flat_store_dword v[0:1], v2
1086 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1087 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
1088 %load = load i32, i32 addrspace(1)* %gep
1089 %or = or i32 %load, -2147483647
1090 %and = and i32 %or, 255
1091 %uitofp = uitofp i32 %and to float
1092 %cast = bitcast i32 %or to float
1093 %add = fadd float %cast, %uitofp
1094 store float %add, float addrspace(1)* %out
; sitofp of (and i64 x, 255): the CHECK lines show GlobalISel emits the full
; generic i64 -> f32 signed conversion sequence (leading-bit count, 64-bit
; shift, sticky-bit or, v_cvt_f32_i32, v_ldexp) after the 0xff mask.
; NOTE(review): since the masked value fits in a byte, this presumably could
; fold to a single v_cvt_f32_ubyte0 — confirm whether this is a known missed
; optimization in the GlobalISel path before tightening the checks.
1098 define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
1099 ; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
1101 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1103 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1104 ; SI-NEXT: v_ffbh_i32_e32 v3, 0
1105 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
1106 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3
1107 ; SI-NEXT: v_mov_b32_e32 v1, 0
1108 ; SI-NEXT: v_min_u32_e32 v2, v3, v2
1109 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1110 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1111 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1112 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
1113 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1114 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1115 ; SI-NEXT: s_setpc_b64 s[30:31]
1117 ; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
1119 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1121 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1122 ; VI-NEXT: v_ffbh_i32_e32 v3, 0
1123 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
1124 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3
1125 ; VI-NEXT: v_mov_b32_e32 v1, 0
1126 ; VI-NEXT: v_min_u32_e32 v2, v3, v2
1127 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1128 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1129 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1130 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
1131 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1132 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1133 ; VI-NEXT: s_setpc_b64 s[30:31]
1134 %masked = and i64 %arg0, 255
1135 %itofp = sitofp i64 %masked to float
; uitofp of (and i64 x, 255): as with the signed variant above, the CHECK
; lines show the full generic i64 -> f32 unsigned sequence (v_ffbh_u32,
; 64-bit shift, sticky-bit or, v_cvt_f32_u32, v_ldexp) rather than a byte
; conversion. NOTE(review): presumably reducible to v_cvt_f32_ubyte0 —
; confirm against the SelectionDAG output before treating this as intended.
1139 define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
1140 ; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
1142 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1143 ; SI-NEXT: v_ffbh_u32_e32 v2, 0
1144 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1145 ; SI-NEXT: v_mov_b32_e32 v1, 0
1146 ; SI-NEXT: v_min_u32_e32 v2, 32, v2
1147 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1148 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1149 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1150 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
1151 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1152 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1153 ; SI-NEXT: s_setpc_b64 s[30:31]
1155 ; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
1157 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1158 ; VI-NEXT: v_ffbh_u32_e32 v2, 0
1159 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1160 ; VI-NEXT: v_mov_b32_e32 v1, 0
1161 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1162 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1163 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1164 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1165 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1166 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1167 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1168 ; VI-NEXT: s_setpc_b64 s[30:31]
1169 %masked = and i64 %arg0, 255
1170 %itofp = uitofp i64 %masked to float
; sitofp of (and i16 x, 255): the masked value is known non-negative, so the
; conversion still ends in v_cvt_f32_ubyte0. SI materializes the i16
; sign-extension explicitly with v_bfe_i32 0,16; VI folds it into the sdwa
; form via sext(v0) with src0_sel:WORD_0.
1174 define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
1175 ; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
1177 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1179 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
1180 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1181 ; SI-NEXT: s_setpc_b64 s[30:31]
1183 ; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
1185 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1187 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1188 ; VI-NEXT: s_setpc_b64 s[30:31]
1189 %masked = and i16 %arg0, 255
1190 %itofp = sitofp i16 %masked to float
1194 define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
1195 ; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
1197 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1198 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1199 ; SI-NEXT: v_bfe_u32 v0, v0, 0, 16
1200 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1201 ; SI-NEXT: s_setpc_b64 s[30:31]
1203 ; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
1205 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1206 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1207 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1208 ; VI-NEXT: s_setpc_b64 s[30:31]
1209 %masked = and i16 %arg0, 255
1210 %itofp = uitofp i16 %masked to float