1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; Unsigned conversion of the low byte of an i32 (and with 255) to float.
; The tahiti checks keep an explicit v_and_b32 ahead of v_cvt_f32_ubyte0,
; while the tonga checks select byte 0 through the SDWA operand of the convert.
8 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
9 ; SI-LABEL: v_uitofp_i32_to_f32_mask255:
11 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
13 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
14 ; SI-NEXT: s_setpc_b64 s[30:31]
16 ; VI-LABEL: v_uitofp_i32_to_f32_mask255:
18 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
20 ; VI-NEXT: s_setpc_b64 s[30:31]
21 %masked = and i32 %arg0, 255 ; keep only bits 0-7
22 %cvt = uitofp i32 %masked to float
; Signed conversion of the low byte of an i32 (and with 255) to float.
; The mask clears the sign bit, so the value is non-negative; the expected
; code is the same unsigned-byte convert as in the uitofp variant of this test.
26 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
27 ; SI-LABEL: v_sitofp_i32_to_f32_mask255:
29 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
31 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
32 ; SI-NEXT: s_setpc_b64 s[30:31]
34 ; VI-LABEL: v_sitofp_i32_to_f32_mask255:
36 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
38 ; VI-NEXT: s_setpc_b64 s[30:31]
39 %masked = and i32 %arg0, 255 ; result is in [0, 255], sign bit is zero
40 %cvt = sitofp i32 %masked to float
; Converts the 8-bit field starting at bit 7 of an i32 to float.
; The shift amount is not a multiple of 8, so the field does not line up with
; a byte; both targets are expected to extract it with v_bfe_u32 (offset 7,
; width 8) and then convert byte 0 of the result.
44 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
45 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
47 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GCN-NEXT: v_bfe_u32 v0, v0, 7, 8
49 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
50 ; GCN-NEXT: s_setpc_b64 s[30:31]
51 %lshr.7 = lshr i32 %arg0, 7
52 %masked = and i32 %lshr.7, 255
53 %cvt = uitofp i32 %masked to float
; Converts byte 1 of an i32 (lshr by 8, and with 255) to float.
; The tahiti checks extract the byte with v_bfe_u32 and convert byte 0 of the
; result; the tonga checks select BYTE_1 directly via SDWA.
57 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
58 ; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
60 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
62 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
63 ; SI-NEXT: s_setpc_b64 s[30:31]
65 ; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
67 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
69 ; VI-NEXT: s_setpc_b64 s[30:31]
70 %lshr.8 = lshr i32 %arg0, 8
71 %masked = and i32 %lshr.8, 255
72 %cvt = uitofp i32 %masked to float
; Same byte-1 conversion as the lshr8 test, but the shifted value has a second
; use: it is also stored to memory. The expected code therefore keeps the
; shift as a separate v_lshrrev_b32 and converts byte 0 of the shifted value
; instead of folding the shift into the convert.
76 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
77 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
79 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
81 ; SI-NEXT: s_mov_b32 s6, -1
82 ; SI-NEXT: s_mov_b32 s7, 0xf000
83 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
84 ; SI-NEXT: s_waitcnt expcnt(0)
85 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
86 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
87 ; SI-NEXT: s_waitcnt vmcnt(0)
88 ; SI-NEXT: s_setpc_b64 s[30:31]
90 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
92 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
94 ; VI-NEXT: flat_store_dword v[0:1], v0
95 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
96 ; VI-NEXT: s_waitcnt vmcnt(0)
97 ; VI-NEXT: s_setpc_b64 s[30:31]
98 %lshr.8 = lshr i32 %arg0, 8
99 store i32 %lshr.8, ptr addrspace(1) undef ; extra use of the shift; the address is undef
100 %masked = and i32 %lshr.8, 255
101 %cvt = uitofp i32 %masked to float
; Converts byte 2 of an i32 (lshr by 16, and with 255) to float.
; The tahiti checks extract the byte with v_bfe_u32 (offset 16, width 8); the
; tonga checks select BYTE_2 directly via SDWA.
105 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
106 ; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
108 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
110 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
111 ; SI-NEXT: s_setpc_b64 s[30:31]
113 ; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
115 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
117 ; VI-NEXT: s_setpc_b64 s[30:31]
118 %lshr.16 = lshr i32 %arg0, 16
119 %masked = and i32 %lshr.16, 255
120 %cvt = uitofp i32 %masked to float
; Converts byte 3 of an i32 (lshr by 24, and with 255) to float.
; A logical shift right by 24 leaves only the top byte, so both targets are
; expected to use v_cvt_f32_ubyte3 directly on the input with no separate
; shift or mask instruction.
124 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
125 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
127 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
129 ; GCN-NEXT: s_setpc_b64 s[30:31]
130 %lshr.24 = lshr i32 %arg0, 24
131 %masked = and i32 %lshr.24, 255 ; no-op on the value: only 8 bits remain after the shift
132 %cvt = uitofp i32 %masked to float
; Unsigned conversion of an i8 argument to float.
; The expected code matches the masked-i32 case: an explicit v_and_b32 with
; 0xff plus v_cvt_f32_ubyte0 on tahiti, and an SDWA BYTE_0 convert on tonga.
136 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
137 ; SI-LABEL: v_uitofp_i8_to_f32:
139 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
141 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
142 ; SI-NEXT: s_setpc_b64 s[30:31]
144 ; VI-LABEL: v_uitofp_i8_to_f32:
146 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
148 ; VI-NEXT: s_setpc_b64 s[30:31]
149 %cvt = uitofp i8 %arg0 to float
; Converts the two bytes of an i16 argument, reinterpreted as <2 x i8>, to
; <2 x float>. Element 0 comes from byte 0 and element 1 from byte 1 of the
; input register; the first result is built in v2 and copied to v0 at the end.
153 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
154 ; SI-LABEL: v_uitofp_v2i8_to_v2f32:
156 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
158 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
159 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
160 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
161 ; SI-NEXT: v_mov_b32_e32 v0, v2
162 ; SI-NEXT: s_setpc_b64 s[30:31]
164 ; VI-LABEL: v_uitofp_v2i8_to_v2f32:
166 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
168 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
169 ; VI-NEXT: v_mov_b32_e32 v0, v2
170 ; VI-NEXT: s_setpc_b64 s[30:31]
171 %val = bitcast i16 %arg0 to <2 x i8>
172 %cvt = uitofp <2 x i8> %val to <2 x float>
; Converts the low three bytes of an i32 argument to <3 x float>.
; The input is truncated to i24 and reinterpreted as <3 x i8>; the expected
; code converts bytes 0, 1 and 2 of the input register.
176 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
177 ; SI-LABEL: v_uitofp_v3i8_to_v3f32:
179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
181 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
182 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
183 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
184 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
185 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
186 ; SI-NEXT: v_mov_b32_e32 v0, v3
187 ; SI-NEXT: s_setpc_b64 s[30:31]
189 ; VI-LABEL: v_uitofp_v3i8_to_v3f32:
191 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
193 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
194 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
195 ; VI-NEXT: v_mov_b32_e32 v0, v3
196 ; VI-NEXT: s_setpc_b64 s[30:31]
197 %trunc = trunc i32 %arg0 to i24 ; drop the top byte so the bitcast widths match
198 %val = bitcast i24 %trunc to <3 x i8>
199 %cvt = uitofp <3 x i8> %val to <3 x float>
; Converts all four bytes of an i32 argument, reinterpreted as <4 x i8>, to
; <4 x float>. Bytes 0-2 go through v_cvt_f32_ubyte0 (after an and/bfe on
; tahiti, or with an SDWA byte select on tonga); byte 3 uses v_cvt_f32_ubyte3
; on both targets.
203 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
204 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
206 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
208 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
209 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
210 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
211 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
212 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
213 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
214 ; SI-NEXT: v_mov_b32_e32 v0, v4
215 ; SI-NEXT: s_setpc_b64 s[30:31]
217 ; VI-LABEL: v_uitofp_v4i8_to_v4f32:
219 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
221 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
222 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
223 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
224 ; VI-NEXT: v_mov_b32_e32 v0, v4
225 ; VI-NEXT: s_setpc_b64 s[30:31]
226 %val = bitcast i32 %arg0 to <4 x i8>
227 %cvt = uitofp <4 x i8> %val to <4 x float>
; Scalar form of the <4 x i8> conversion: each byte of the i32 argument is
; extracted by hand (lshr + and with 255), converted to float, and inserted
; into the result vector. The expected instruction sequences are identical to
; those of v_uitofp_v4i8_to_v4f32.
231 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
232 ; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
234 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
236 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
237 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
238 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
239 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
240 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
241 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
242 ; SI-NEXT: v_mov_b32_e32 v0, v4
243 ; SI-NEXT: s_setpc_b64 s[30:31]
245 ; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
247 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
249 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
250 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
251 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
252 ; VI-NEXT: v_mov_b32_e32 v0, v4
253 ; VI-NEXT: s_setpc_b64 s[30:31]
; Byte 0: bits 0-7.
254 %mask.arg0 = and i32 %arg0, 255
255 %cvt0 = uitofp i32 %mask.arg0 to float
; Byte 1: bits 8-15.
257 %lshr.8 = lshr i32 %arg0, 8
258 %mask.lshr.8 = and i32 %lshr.8, 255
259 %cvt1 = uitofp i32 %mask.lshr.8 to float
; Byte 2: bits 16-23.
261 %lshr.16 = lshr i32 %arg0, 16
262 %mask.lshr.16 = and i32 %lshr.16, 255
263 %cvt2 = uitofp i32 %mask.lshr.16 to float
; Byte 3: bits 24-31.
265 %lshr.24 = lshr i32 %arg0, 24
266 %mask.lshr.24 = and i32 %lshr.24, 255
267 %cvt3 = uitofp i32 %mask.lshr.24 to float
; Assemble the four converted bytes in element order 0..3.
269 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
270 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
271 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
272 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
273 ret <4 x float> %ins.3
; Unsigned conversion of the low byte of an i32 to half.
; The expected code converts the byte to f32 first (same sequence as the f32
; test of this pattern) and then narrows with v_cvt_f16_f32.
276 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
277 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
279 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
281 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
282 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
283 ; SI-NEXT: s_setpc_b64 s[30:31]
285 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
287 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
289 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
290 ; VI-NEXT: s_setpc_b64 s[30:31]
291 %masked = and i32 %arg0, 255
292 %cvt = uitofp i32 %masked to half
; Signed conversion of the low byte of an i32 to half.
; The mask clears the sign bit, so the expected code is the same unsigned-byte
; convert followed by v_cvt_f16_f32 as in the uitofp variant of this test.
296 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
297 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
299 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
301 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
302 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
303 ; SI-NEXT: s_setpc_b64 s[30:31]
305 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
307 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
309 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
310 ; VI-NEXT: s_setpc_b64 s[30:31]
311 %masked = and i32 %arg0, 255 ; result is in [0, 255], sign bit is zero
312 %cvt = sitofp i32 %masked to half
; Converts byte 1 of an i32 (lshr by 8, and with 255) to half.
; Expected code: byte extract plus f32 convert (v_bfe_u32 on tahiti, SDWA
; BYTE_1 on tonga), then v_cvt_f16_f32.
316 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
317 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
319 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
321 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
322 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
323 ; SI-NEXT: s_setpc_b64 s[30:31]
325 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
327 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
329 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
330 ; VI-NEXT: s_setpc_b64 s[30:31]
331 %lshr.8 = lshr i32 %arg0, 8
332 %masked = and i32 %lshr.8, 255
333 %cvt = uitofp i32 %masked to half
; Converts byte 2 of an i32 (lshr by 16, and with 255) to half.
; Expected code: byte extract plus f32 convert (v_bfe_u32 on tahiti, SDWA
; BYTE_2 on tonga), then v_cvt_f16_f32.
337 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
338 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
342 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
343 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
344 ; SI-NEXT: s_setpc_b64 s[30:31]
346 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
348 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
350 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
351 ; VI-NEXT: s_setpc_b64 s[30:31]
352 %lshr.16 = lshr i32 %arg0, 16
353 %masked = and i32 %lshr.16, 255
354 %cvt = uitofp i32 %masked to half
; Converts byte 3 of an i32 (lshr by 24, and with 255) to half.
; A logical shift right by 24 leaves only the top byte, so both targets are
; expected to use v_cvt_f32_ubyte3 directly on the input and then narrow with
; v_cvt_f16_f32.
358 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
359 ; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
361 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
363 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
364 ; GCN-NEXT: s_setpc_b64 s[30:31]
365 %lshr.24 = lshr i32 %arg0, 24
366 %masked = and i32 %lshr.24, 255 ; no-op on the value: only 8 bits remain after the shift
367 %cvt = uitofp i32 %masked to half
; Unsigned conversion of an i8 argument to half.
; Expected code: the same byte-0 to f32 convert as the masked-i32 half test,
; followed by v_cvt_f16_f32.
371 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
372 ; SI-LABEL: v_uitofp_i8_to_f16:
374 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
376 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
377 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
378 ; SI-NEXT: s_setpc_b64 s[30:31]
380 ; VI-LABEL: v_uitofp_i8_to_f16:
382 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
384 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
385 ; VI-NEXT: s_setpc_b64 s[30:31]
386 %cvt = uitofp i8 %arg0 to half
; Unsigned conversion of the low byte of an i32 to double.
; No byte-convert instruction appears in the expected code for the f64 case:
; both targets mask with v_and_b32 and convert with v_cvt_f64_u32.
390 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
391 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
393 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
395 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
396 ; GCN-NEXT: s_setpc_b64 s[30:31]
397 %masked = and i32 %arg0, 255
398 %cvt = uitofp i32 %masked to double
; Converts byte 1 of an i32 (lshr by 8, and with 255) to double.
; Both targets are expected to extract the byte with v_bfe_u32 (offset 8,
; width 8) and convert with v_cvt_f64_u32.
402 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
403 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
405 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
407 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
408 ; GCN-NEXT: s_setpc_b64 s[30:31]
409 %lshr.8 = lshr i32 %arg0, 8
410 %masked = and i32 %lshr.8, 255
411 %cvt = uitofp i32 %masked to double
; Converts byte 2 of an i32 (lshr by 16, and with 255) to double.
; Both targets are expected to extract the byte with v_bfe_u32 (offset 16,
; width 8) and convert with v_cvt_f64_u32.
415 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
416 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
418 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
420 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
421 ; GCN-NEXT: s_setpc_b64 s[30:31]
422 %lshr.16 = lshr i32 %arg0, 16
423 %masked = and i32 %lshr.16, 255
424 %cvt = uitofp i32 %masked to double
; Converts byte 3 of an i32 (lshr by 24, and with 255) to double.
; A logical shift right by 24 leaves only the top byte, so the expected code
; is a plain v_lshrrev_b32 by 24 with no separate mask, followed by
; v_cvt_f64_u32.
428 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
429 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
431 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
433 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
434 ; GCN-NEXT: s_setpc_b64 s[30:31]
435 %lshr.24 = lshr i32 %arg0, 24
436 %masked = and i32 %lshr.24, 255 ; no-op on the value: only 8 bits remain after the shift
437 %cvt = uitofp i32 %masked to double
; Unsigned conversion of an i8 argument to double.
; Both targets are expected to mask the register with 0xff and convert with
; v_cvt_f64_u32.
441 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
442 ; GCN-LABEL: v_uitofp_i8_to_f64:
444 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
446 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
447 ; GCN-NEXT: s_setpc_b64 s[30:31]
448 %cvt = uitofp i8 %arg0 to double
; Kernel: loads the i8 at %in[workitem.id.x], converts it to float, and stores
; the result to %out (the store address is not indexed by the work-item id).
; The loaded byte comes from a ubyte load, so the expected code converts it
; with v_cvt_f32_ubyte0 and no extra mask on either target.
452 define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
453 ; SI-LABEL: load_i8_to_f32:
455 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
456 ; SI-NEXT: s_mov_b32 s6, 0
457 ; SI-NEXT: s_mov_b32 s7, 0xf000
458 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
460 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
461 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
462 ; SI-NEXT: s_mov_b32 s6, -1
463 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
464 ; SI-NEXT: s_waitcnt vmcnt(0)
465 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
466 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
469 ; VI-LABEL: load_i8_to_f32:
471 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
472 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
473 ; VI-NEXT: s_waitcnt lgkmcnt(0)
474 ; VI-NEXT: v_mov_b32_e32 v1, s2
475 ; VI-NEXT: v_mov_b32_e32 v2, s3
476 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
477 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
478 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
479 ; VI-NEXT: s_waitcnt vmcnt(0)
480 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
481 ; VI-NEXT: v_mov_b32_e32 v0, s0
482 ; VI-NEXT: v_mov_b32_e32 v1, s1
483 ; VI-NEXT: flat_store_dword v[0:1], v2
485 %tid = call i32 @llvm.amdgcn.workitem.id.x()
486 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid ; per-work-item input address
487 %load = load i8, ptr addrspace(1) %gep, align 1
488 %cvt = uitofp i8 %load to float
489 store float %cvt, ptr addrspace(1) %out, align 4
; Kernel: loads the <2 x i8> at %in[workitem.id.x] (2-byte aligned), converts
; both elements to float, and stores the <2 x float> result to %out.
; The expected code loads the pair with a single ushort load and converts
; bytes 0 and 1 of the loaded register.
493 define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
494 ; SI-LABEL: load_v2i8_to_v2f32:
496 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
497 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
498 ; SI-NEXT: v_mov_b32_e32 v1, 0
499 ; SI-NEXT: s_mov_b32 s6, 0
500 ; SI-NEXT: s_mov_b32 s7, 0xf000
501 ; SI-NEXT: s_waitcnt lgkmcnt(0)
502 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
503 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
504 ; SI-NEXT: s_mov_b32 s6, -1
505 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
506 ; SI-NEXT: s_waitcnt vmcnt(0)
507 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
508 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
509 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
510 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
511 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
514 ; VI-LABEL: load_v2i8_to_v2f32:
516 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
517 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
518 ; VI-NEXT: s_waitcnt lgkmcnt(0)
519 ; VI-NEXT: v_mov_b32_e32 v0, s2
520 ; VI-NEXT: v_mov_b32_e32 v1, s3
521 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
522 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
523 ; VI-NEXT: flat_load_ushort v1, v[0:1]
524 ; VI-NEXT: v_mov_b32_e32 v3, s1
525 ; VI-NEXT: v_mov_b32_e32 v2, s0
526 ; VI-NEXT: s_waitcnt vmcnt(0)
527 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
528 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
529 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
531 %tid = call i32 @llvm.amdgcn.workitem.id.x()
532 %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid ; per-work-item input address
533 %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
534 %cvt = uitofp <2 x i8> %load to <2 x float>
535 store <2 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: loads the <3 x i8> at %in[workitem.id.x] (4-byte aligned), converts
; the three elements to float, and stores the <3 x float> result to %out.
; The expected code fetches the vector with one dword load and converts bytes
; 0-2. The tahiti checks split the store into dwordx2 + dword; the tonga
; checks use a single dwordx3 store.
539 define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
540 ; SI-LABEL: load_v3i8_to_v3f32:
542 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
543 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
544 ; SI-NEXT: v_mov_b32_e32 v1, 0
545 ; SI-NEXT: s_mov_b32 s6, 0
546 ; SI-NEXT: s_mov_b32 s7, 0xf000
547 ; SI-NEXT: s_waitcnt lgkmcnt(0)
548 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
549 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
550 ; SI-NEXT: s_mov_b32 s6, -1
551 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
552 ; SI-NEXT: s_waitcnt vmcnt(0)
553 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
554 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
555 ; SI-NEXT: v_bfe_u32 v3, v0, 16, 8
556 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
557 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
558 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
559 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
560 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
563 ; VI-LABEL: load_v3i8_to_v3f32:
565 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
566 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
567 ; VI-NEXT: s_waitcnt lgkmcnt(0)
568 ; VI-NEXT: v_mov_b32_e32 v0, s2
569 ; VI-NEXT: v_mov_b32_e32 v1, s3
570 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
571 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
572 ; VI-NEXT: flat_load_dword v2, v[0:1]
573 ; VI-NEXT: v_mov_b32_e32 v4, s1
574 ; VI-NEXT: v_mov_b32_e32 v3, s0
575 ; VI-NEXT: s_waitcnt vmcnt(0)
576 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
577 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
578 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
579 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
581 %tid = call i32 @llvm.amdgcn.workitem.id.x()
582 %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid ; per-work-item input address
583 %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
584 %cvt = uitofp <3 x i8> %load to <3 x float>
585 store <3 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: loads the <4 x i8> at %in[workitem.id.x] (4-byte aligned), converts
; all four elements to float, and stores the <4 x float> result to %out.
; The expected code uses one dword load; bytes 0-2 are converted through
; v_cvt_f32_ubyte0 (and/bfe on tahiti, SDWA byte select on tonga) and byte 3
; through v_cvt_f32_ubyte3.
589 define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
590 ; SI-LABEL: load_v4i8_to_v4f32:
592 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
593 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
594 ; SI-NEXT: v_mov_b32_e32 v1, 0
595 ; SI-NEXT: s_mov_b32 s6, 0
596 ; SI-NEXT: s_mov_b32 s7, 0xf000
597 ; SI-NEXT: s_waitcnt lgkmcnt(0)
598 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
599 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
600 ; SI-NEXT: s_mov_b32 s6, -1
601 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
602 ; SI-NEXT: s_waitcnt vmcnt(0)
603 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
604 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
605 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
606 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
607 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
608 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
609 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
610 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
613 ; VI-LABEL: load_v4i8_to_v4f32:
615 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
616 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
617 ; VI-NEXT: s_waitcnt lgkmcnt(0)
618 ; VI-NEXT: v_mov_b32_e32 v0, s2
619 ; VI-NEXT: v_mov_b32_e32 v1, s3
620 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
621 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
622 ; VI-NEXT: flat_load_dword v3, v[0:1]
623 ; VI-NEXT: v_mov_b32_e32 v5, s1
624 ; VI-NEXT: v_mov_b32_e32 v4, s0
625 ; VI-NEXT: s_waitcnt vmcnt(0)
626 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
627 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
628 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
629 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
630 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
632 %tid = call i32 @llvm.amdgcn.workitem.id.x()
633 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid ; per-work-item input address
634 %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
635 %cvt = uitofp <4 x i8> %load to <4 x float>
636 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
640 ; This test should not need extra instructions that shift each loaded byte
641 ; into its position in the word before converting that component.
643 ; FIXME: The bytes are packed into a dword and then unpacked again.
; Kernel: same conversion as load_v4i8_to_v4f32, but the <4 x i8> load is only
; 1-byte aligned. The expected code issues four separate ubyte loads, packs
; them into one dword with v_lshlrev_b32/v_or_b32, and then unpacks that dword
; again for the per-byte converts.
644 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
645 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
647 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
648 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
649 ; SI-NEXT: v_mov_b32_e32 v1, 0
650 ; SI-NEXT: s_mov_b32 s6, 0
651 ; SI-NEXT: s_mov_b32 s7, 0xf000
652 ; SI-NEXT: s_waitcnt lgkmcnt(0)
653 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
654 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
655 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
656 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
657 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
658 ; SI-NEXT: s_mov_b32 s6, -1
659 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
660 ; SI-NEXT: s_waitcnt vmcnt(3)
661 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
662 ; SI-NEXT: s_waitcnt vmcnt(2)
663 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
664 ; SI-NEXT: s_waitcnt vmcnt(1)
665 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
666 ; SI-NEXT: s_waitcnt vmcnt(0)
667 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
668 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
669 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
670 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
671 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
672 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
673 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
674 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
675 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
676 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
677 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
680 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
682 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
683 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
685 ; VI-NEXT: v_mov_b32_e32 v0, s2
686 ; VI-NEXT: v_mov_b32_e32 v1, s3
687 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
688 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
689 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
690 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
691 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
692 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
693 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
694 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
695 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
696 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
697 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
698 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
699 ; VI-NEXT: s_waitcnt vmcnt(3)
700 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
701 ; VI-NEXT: s_waitcnt vmcnt(2)
702 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
703 ; VI-NEXT: s_waitcnt vmcnt(1)
704 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
705 ; VI-NEXT: s_waitcnt vmcnt(0)
706 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
707 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
708 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
709 ; VI-NEXT: v_mov_b32_e32 v5, s1
710 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
711 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
712 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
713 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
714 ; VI-NEXT: v_mov_b32_e32 v4, s0
715 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
717 %tid = call i32 @llvm.amdgcn.workitem.id.x()
718 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid ; per-work-item input address
719 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 ; under-aligned on purpose
720 %cvt = uitofp <4 x i8> %load to <4 x float>
721 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: loads the <4 x i8> at %in[workitem.id.x] and uses it twice:
;   1. converted to <4 x float> and stored to %out;
;   2. incremented by 9 in every lane and stored as <4 x i8> to %out2.
; The expected code has to serve both uses from one dword load: the float
; converts read the loaded bytes, and the incremented bytes are repacked into
; a dword with shift/or before the second store.
725 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
726 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
728 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
729 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
730 ; SI-NEXT: v_mov_b32_e32 v1, 0
731 ; SI-NEXT: s_mov_b32 s6, 0
732 ; SI-NEXT: s_mov_b32 s7, 0xf000
733 ; SI-NEXT: s_waitcnt lgkmcnt(0)
734 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
735 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
736 ; SI-NEXT: s_mov_b32 s6, -1
737 ; SI-NEXT: s_waitcnt lgkmcnt(0)
738 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
739 ; SI-NEXT: s_waitcnt vmcnt(0)
740 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
741 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
742 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v0
743 ; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
744 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
745 ; SI-NEXT: v_add_i32_e32 v6, vcc, 9, v0
746 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
747 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v1
748 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v2
749 ; SI-NEXT: v_add_i32_e32 v8, vcc, 9, v1
750 ; SI-NEXT: v_add_i32_e32 v9, vcc, 9, v2
751 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
752 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
753 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v8
754 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
755 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
756 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v9
757 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
758 ; SI-NEXT: s_waitcnt expcnt(0)
759 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
760 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
761 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
762 ; SI-NEXT: v_or_b32_e32 v0, v6, v0
763 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v4
764 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
765 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
766 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
767 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
770 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
772 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
773 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
774 ; VI-NEXT: v_mov_b32_e32 v6, 8
775 ; VI-NEXT: s_waitcnt lgkmcnt(0)
776 ; VI-NEXT: v_mov_b32_e32 v0, s2
777 ; VI-NEXT: v_mov_b32_e32 v1, s3
778 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
779 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
780 ; VI-NEXT: flat_load_dword v1, v[0:1]
781 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
782 ; VI-NEXT: v_mov_b32_e32 v2, 9
783 ; VI-NEXT: s_waitcnt lgkmcnt(0)
784 ; VI-NEXT: v_mov_b32_e32 v5, s1
785 ; VI-NEXT: v_mov_b32_e32 v4, s0
786 ; VI-NEXT: s_waitcnt vmcnt(0)
787 ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
788 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
789 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
790 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
791 ; VI-NEXT: v_add_u16_e32 v9, 9, v1
792 ; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
793 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
794 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
795 ; VI-NEXT: v_add_u16_e32 v7, 9, v7
796 ; VI-NEXT: v_add_u16_e32 v8, 9, v8
797 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
798 ; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
799 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
800 ; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
801 ; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
802 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
803 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
804 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
805 ; VI-NEXT: v_or_b32_e32 v2, v0, v2
806 ; VI-NEXT: v_mov_b32_e32 v0, s2
807 ; VI-NEXT: v_mov_b32_e32 v1, s3
808 ; VI-NEXT: flat_store_dword v[0:1], v2
810 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
811 %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
812 %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
813 %cvt = uitofp <4 x i8> %load to <4 x float>
814 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
815 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
816 store <4 x i8> %add, ptr addrspace(1) %out2, align 4
; Test: uitofp of a <7 x i8> vector to <7 x float>.
; The vector is loaded with align 1 from %in (indexed by workitem id x) and the
; converted result is stored to %out.
; The checks on both SI and VI show seven separate ubyte loads, each followed
; by its own v_cvt_f32_ubyte0 once the matching load has completed.
820 define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
821 ; SI-LABEL: load_v7i8_to_v7f32:
823 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
824 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
825 ; SI-NEXT: v_mov_b32_e32 v1, 0
826 ; SI-NEXT: s_mov_b32 s6, 0
827 ; SI-NEXT: s_mov_b32 s7, 0xf000
828 ; SI-NEXT: s_waitcnt lgkmcnt(0)
829 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
830 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
831 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
832 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
833 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
834 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
835 ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
836 ; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
837 ; SI-NEXT: s_mov_b32 s6, -1
838 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
839 ; SI-NEXT: s_waitcnt vmcnt(6)
840 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
841 ; SI-NEXT: s_waitcnt vmcnt(5)
842 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
843 ; SI-NEXT: s_waitcnt vmcnt(4)
844 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
845 ; SI-NEXT: s_waitcnt vmcnt(3)
846 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
847 ; SI-NEXT: s_waitcnt vmcnt(2)
848 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
849 ; SI-NEXT: s_waitcnt vmcnt(1)
850 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
851 ; SI-NEXT: s_waitcnt vmcnt(0)
852 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
853 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
854 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
855 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24
858 ; VI-LABEL: load_v7i8_to_v7f32:
860 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
861 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
862 ; VI-NEXT: s_waitcnt lgkmcnt(0)
863 ; VI-NEXT: v_mov_b32_e32 v0, s2
864 ; VI-NEXT: v_mov_b32_e32 v1, s3
865 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
866 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
867 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
868 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
869 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
870 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
871 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
872 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
873 ; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
874 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
875 ; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0
876 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
877 ; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0
878 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
879 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
880 ; VI-NEXT: flat_load_ubyte v1, v[2:3]
881 ; VI-NEXT: flat_load_ubyte v2, v[4:5]
882 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
883 ; VI-NEXT: flat_load_ubyte v4, v[8:9]
884 ; VI-NEXT: flat_load_ubyte v5, v[10:11]
885 ; VI-NEXT: flat_load_ubyte v6, v[12:13]
886 ; VI-NEXT: v_mov_b32_e32 v8, s1
887 ; VI-NEXT: v_mov_b32_e32 v7, s0
888 ; VI-NEXT: s_add_u32 s0, s0, 16
889 ; VI-NEXT: s_addc_u32 s1, s1, 0
890 ; VI-NEXT: v_mov_b32_e32 v10, s1
891 ; VI-NEXT: v_mov_b32_e32 v9, s0
892 ; VI-NEXT: s_waitcnt vmcnt(6)
893 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
894 ; VI-NEXT: s_waitcnt vmcnt(5)
895 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
896 ; VI-NEXT: s_waitcnt vmcnt(4)
897 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
898 ; VI-NEXT: s_waitcnt vmcnt(3)
899 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
900 ; VI-NEXT: s_waitcnt vmcnt(2)
901 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
902 ; VI-NEXT: s_waitcnt vmcnt(1)
903 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v5
904 ; VI-NEXT: s_waitcnt vmcnt(0)
905 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
906 ; VI-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
907 ; VI-NEXT: flat_store_dwordx3 v[9:10], v[4:6]
909 %tid = call i32 @llvm.amdgcn.workitem.id.x()
910 %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
911 %load = load <7 x i8>, ptr addrspace(1) %gep, align 1 ; byte-aligned vector load
912 %cvt = uitofp <7 x i8> %load to <7 x float>
913 store <7 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: uitofp of a <8 x i8> vector to <8 x float>.
; The vector is loaded with align 8 from %in (indexed by workitem id x); the
; checks show a single dwordx2 load on both targets.
; SI: bytes 0-2 of each dword are isolated with v_and_b32 / v_bfe_u32 and then
;     converted with v_cvt_f32_ubyte0; byte 3 uses v_cvt_f32_ubyte3 directly.
; VI: bytes 0-2 are read through SDWA src0_sel:BYTE_0..BYTE_2 on
;     v_cvt_f32_ubyte0; byte 3 uses v_cvt_f32_ubyte3.
917 define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
918 ; SI-LABEL: load_v8i8_to_v8f32:
920 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
921 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
922 ; SI-NEXT: v_mov_b32_e32 v1, 0
923 ; SI-NEXT: s_mov_b32 s6, 0
924 ; SI-NEXT: s_mov_b32 s7, 0xf000
925 ; SI-NEXT: s_waitcnt lgkmcnt(0)
926 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
927 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
928 ; SI-NEXT: s_mov_b32 s6, -1
929 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
930 ; SI-NEXT: s_waitcnt vmcnt(0)
931 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
932 ; SI-NEXT: v_bfe_u32 v4, v0, 8, 8
933 ; SI-NEXT: v_bfe_u32 v5, v0, 16, 8
934 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
935 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
936 ; SI-NEXT: v_bfe_u32 v8, v1, 8, 8
937 ; SI-NEXT: v_bfe_u32 v9, v1, 16, 8
938 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v1
939 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
940 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
941 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
942 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
943 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v8
944 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9
945 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
946 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
949 ; VI-LABEL: load_v8i8_to_v8f32:
951 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
952 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
953 ; VI-NEXT: s_waitcnt lgkmcnt(0)
954 ; VI-NEXT: v_mov_b32_e32 v0, s2
955 ; VI-NEXT: v_mov_b32_e32 v1, s3
956 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
957 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
958 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
959 ; VI-NEXT: v_mov_b32_e32 v9, s1
960 ; VI-NEXT: v_mov_b32_e32 v8, s0
961 ; VI-NEXT: s_add_u32 s0, s0, 16
962 ; VI-NEXT: s_addc_u32 s1, s1, 0
963 ; VI-NEXT: v_mov_b32_e32 v11, s1
964 ; VI-NEXT: v_mov_b32_e32 v10, s0
965 ; VI-NEXT: s_waitcnt vmcnt(0)
966 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
967 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
968 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
969 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v6
970 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
971 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
972 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
973 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v7
974 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
975 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
977 %tid = call i32 @llvm.amdgcn.workitem.id.x()
978 %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
979 %load = load <8 x i8>, ptr addrspace(1) %gep, align 8 ; naturally aligned 8-byte load
980 %cvt = uitofp <8 x i8> %load to <8 x float>
981 store <8 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: uitofp of the low byte of a computed i32 value.
; An i32 is loaded from %in (indexed by workitem id x), 2 is added, the result
; is masked with 255 and converted with uitofp; the float is stored to %out.
; SI: the checks show v_add, an explicit v_and_b32 with 0xff, then
;     v_cvt_f32_ubyte0.
; VI: the checks show no separate v_and; v_cvt_f32_ubyte0 reads the low byte
;     through SDWA src0_sel:BYTE_0.
985 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
986 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
988 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
989 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
990 ; SI-NEXT: v_mov_b32_e32 v1, 0
991 ; SI-NEXT: s_mov_b32 s6, 0
992 ; SI-NEXT: s_mov_b32 s7, 0xf000
993 ; SI-NEXT: s_waitcnt lgkmcnt(0)
994 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
995 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
996 ; SI-NEXT: s_mov_b32 s6, -1
997 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
998 ; SI-NEXT: s_waitcnt vmcnt(0)
999 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
1000 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1001 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1002 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1005 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
1007 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1008 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1009 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1010 ; VI-NEXT: v_mov_b32_e32 v0, s2
1011 ; VI-NEXT: v_mov_b32_e32 v1, s3
1012 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1013 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1014 ; VI-NEXT: flat_load_dword v0, v[0:1]
1015 ; VI-NEXT: s_waitcnt vmcnt(0)
1016 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
1017 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1018 ; VI-NEXT: v_mov_b32_e32 v0, s0
1019 ; VI-NEXT: v_mov_b32_e32 v1, s1
1020 ; VI-NEXT: flat_store_dword v[0:1], v2
1022 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1023 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1024 %load = load i32, ptr addrspace(1) %gep, align 4
1025 %add = add i32 %load, 2
1026 %inreg = and i32 %add, 255 ; keep only bits 0..7 of the sum
1027 %cvt = uitofp i32 %inreg to float
1028 store float %cvt, ptr addrspace(1) %out, align 4
; Test: uitofp of byte 1 of a loaded i32, expressed as mask-then-shift.
; The i32 is masked with 65280 (0xff00), shifted right by 8 and converted with
; uitofp; the float is stored to %out.
; SI: the checks show v_bfe_u32 (offset 8, width 8) followed by
;     v_cvt_f32_ubyte0.
; VI: the checks show a single v_cvt_f32_ubyte0 with SDWA src0_sel:BYTE_1.
1032 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1033 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
1035 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1036 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1037 ; SI-NEXT: v_mov_b32_e32 v1, 0
1038 ; SI-NEXT: s_mov_b32 s6, 0
1039 ; SI-NEXT: s_mov_b32 s7, 0xf000
1040 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1041 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1042 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1043 ; SI-NEXT: s_mov_b32 s6, -1
1044 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1045 ; SI-NEXT: s_waitcnt vmcnt(0)
1046 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
1047 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1048 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1051 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
1053 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1054 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1055 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1056 ; VI-NEXT: v_mov_b32_e32 v0, s2
1057 ; VI-NEXT: v_mov_b32_e32 v1, s3
1058 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1059 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1060 ; VI-NEXT: flat_load_dword v0, v[0:1]
1061 ; VI-NEXT: s_waitcnt vmcnt(0)
1062 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1063 ; VI-NEXT: v_mov_b32_e32 v0, s0
1064 ; VI-NEXT: v_mov_b32_e32 v1, s1
1065 ; VI-NEXT: flat_store_dword v[0:1], v2
1067 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1068 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1069 %load = load i32, ptr addrspace(1) %gep, align 4
1070 %inreg = and i32 %load, 65280 ; 65280 = 0xff00, i.e. bits 8..15
1071 %shr = lshr i32 %inreg, 8 ; move the selected byte down to bits 0..7
1072 %cvt = uitofp i32 %shr to float
1073 store float %cvt, ptr addrspace(1) %out, align 4
1077 ; We don't get these ones because of the zext, but instcombine removes
1078 ; them so it shouldn't really matter.
; NOTE(review): the SI and VI checks below do contain v_cvt_f32_ubyte0_e32, so
; the statement above may be stale for this GlobalISel run - confirm.
; Test: an i8 is loaded from %in (indexed by workitem id x), zero-extended to
; i32 and converted with uitofp; the float is stored to %out.
1079 define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1080 ; SI-LABEL: i8_zext_i32_to_f32:
1082 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1083 ; SI-NEXT: s_mov_b32 s6, 0
1084 ; SI-NEXT: s_mov_b32 s7, 0xf000
1085 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1086 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1087 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1088 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1089 ; SI-NEXT: s_mov_b32 s6, -1
1090 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1091 ; SI-NEXT: s_waitcnt vmcnt(0)
1092 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1093 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1096 ; VI-LABEL: i8_zext_i32_to_f32:
1098 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1099 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1101 ; VI-NEXT: v_mov_b32_e32 v1, s2
1102 ; VI-NEXT: v_mov_b32_e32 v2, s3
1103 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
1104 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
1105 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1106 ; VI-NEXT: s_waitcnt vmcnt(0)
1107 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
1108 ; VI-NEXT: v_mov_b32_e32 v0, s0
1109 ; VI-NEXT: v_mov_b32_e32 v1, s1
1110 ; VI-NEXT: flat_store_dword v[0:1], v2
1112 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1113 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
1114 %load = load i8, ptr addrspace(1) %gep, align 1
1115 %ext = zext i8 %load to i32
1116 %cvt = uitofp i32 %ext to float
1117 store float %cvt, ptr addrspace(1) %out, align 4
; Test: <4 x i8> loaded with align 1, zero-extended to <4 x i32> and converted
; with uitofp to <4 x float>, then stored to %out.
; On both targets the checks show four ubyte loads that are shifted and OR-ed
; back into one dword before the per-byte conversions.
; SI: bytes 0-2 are isolated with v_and_b32 / v_bfe_u32 and converted with
;     v_cvt_f32_ubyte0; byte 3 uses v_cvt_f32_ubyte3.
; VI: bytes 0-2 use v_cvt_f32_ubyte0 with SDWA byte selects; byte 3 uses
;     v_cvt_f32_ubyte3.
1121 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1122 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
1124 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1125 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1126 ; SI-NEXT: v_mov_b32_e32 v1, 0
1127 ; SI-NEXT: s_mov_b32 s6, 0
1128 ; SI-NEXT: s_mov_b32 s7, 0xf000
1129 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1130 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1131 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
1132 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
1133 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
1134 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1135 ; SI-NEXT: s_mov_b32 s6, -1
1136 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1137 ; SI-NEXT: s_waitcnt vmcnt(3)
1138 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1139 ; SI-NEXT: s_waitcnt vmcnt(2)
1140 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1141 ; SI-NEXT: s_waitcnt vmcnt(1)
1142 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1143 ; SI-NEXT: s_waitcnt vmcnt(0)
1144 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1145 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
1146 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1147 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
1148 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
1149 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
1150 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1151 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
1152 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
1153 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1154 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1157 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
1159 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1160 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1161 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1162 ; VI-NEXT: v_mov_b32_e32 v0, s2
1163 ; VI-NEXT: v_mov_b32_e32 v1, s3
1164 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1165 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1166 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
1167 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1168 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
1169 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1170 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
1171 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1172 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1173 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
1174 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
1175 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1176 ; VI-NEXT: s_waitcnt vmcnt(3)
1177 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1178 ; VI-NEXT: s_waitcnt vmcnt(2)
1179 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1180 ; VI-NEXT: s_waitcnt vmcnt(1)
1181 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1182 ; VI-NEXT: s_waitcnt vmcnt(0)
1183 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1184 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
1185 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
1186 ; VI-NEXT: v_mov_b32_e32 v5, s1
1187 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1188 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1189 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1190 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
1191 ; VI-NEXT: v_mov_b32_e32 v4, s0
1192 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1194 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1195 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1196 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 ; byte-aligned vector load
1197 %ext = zext <4 x i8> %load to <4 x i32>
1198 %cvt = uitofp <4 x i32> %ext to <4 x float>
1199 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: uitofp of byte 0 of a loaded i32 (and with 255, no shift).
; The i32 is loaded from %in (indexed by workitem id x); the float is stored
; to %out.
; On both targets the checks show a ubyte load instead of a dword load,
; followed by a plain v_cvt_f32_ubyte0_e32.
1203 define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1204 ; SI-LABEL: extract_byte0_to_f32:
1206 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1207 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1208 ; SI-NEXT: v_mov_b32_e32 v1, 0
1209 ; SI-NEXT: s_mov_b32 s6, 0
1210 ; SI-NEXT: s_mov_b32 s7, 0xf000
1211 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1212 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1213 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1214 ; SI-NEXT: s_mov_b32 s6, -1
1215 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1216 ; SI-NEXT: s_waitcnt vmcnt(0)
1217 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1218 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1221 ; VI-LABEL: extract_byte0_to_f32:
1223 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1224 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1225 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1226 ; VI-NEXT: v_mov_b32_e32 v0, s2
1227 ; VI-NEXT: v_mov_b32_e32 v1, s3
1228 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1229 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1230 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1231 ; VI-NEXT: s_waitcnt vmcnt(0)
1232 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
1233 ; VI-NEXT: v_mov_b32_e32 v0, s0
1234 ; VI-NEXT: v_mov_b32_e32 v1, s1
1235 ; VI-NEXT: flat_store_dword v[0:1], v2
1237 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1238 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1239 %val = load i32, ptr addrspace(1) %gep
1240 %and = and i32 %val, 255 ; keep only bits 0..7
1241 %cvt = uitofp i32 %and to float
1242 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 1 of a loaded i32 (lshr by 8, then and with 255).
; The i32 is loaded from %in (indexed by workitem id x); the float is stored
; to %out.
; SI: the checks show v_bfe_u32 (offset 8, width 8) then v_cvt_f32_ubyte0.
; VI: the checks show a single v_cvt_f32_ubyte0 with SDWA src0_sel:BYTE_1.
1246 define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1247 ; SI-LABEL: extract_byte1_to_f32:
1249 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1250 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1251 ; SI-NEXT: v_mov_b32_e32 v1, 0
1252 ; SI-NEXT: s_mov_b32 s6, 0
1253 ; SI-NEXT: s_mov_b32 s7, 0xf000
1254 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1255 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1256 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1257 ; SI-NEXT: s_mov_b32 s6, -1
1258 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1259 ; SI-NEXT: s_waitcnt vmcnt(0)
1260 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
1261 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1262 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1265 ; VI-LABEL: extract_byte1_to_f32:
1267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1268 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1269 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1270 ; VI-NEXT: v_mov_b32_e32 v0, s2
1271 ; VI-NEXT: v_mov_b32_e32 v1, s3
1272 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1273 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274 ; VI-NEXT: flat_load_dword v0, v[0:1]
1275 ; VI-NEXT: s_waitcnt vmcnt(0)
1276 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1277 ; VI-NEXT: v_mov_b32_e32 v0, s0
1278 ; VI-NEXT: v_mov_b32_e32 v1, s1
1279 ; VI-NEXT: flat_store_dword v[0:1], v2
1281 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1282 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1283 %val = load i32, ptr addrspace(1) %gep
1284 %srl = lshr i32 %val, 8 ; move bits 8..15 down to bits 0..7
1285 %and = and i32 %srl, 255
1286 %cvt = uitofp i32 %and to float
1287 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 2 of a loaded i32 (lshr by 16, then and with 255).
; The i32 is loaded from %in (indexed by workitem id x); the float is stored
; to %out.
; SI: the checks show v_bfe_u32 (offset 16, width 8) then v_cvt_f32_ubyte0.
; VI: the checks show a single v_cvt_f32_ubyte0 with SDWA src0_sel:BYTE_2.
1291 define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1292 ; SI-LABEL: extract_byte2_to_f32:
1294 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1295 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1296 ; SI-NEXT: v_mov_b32_e32 v1, 0
1297 ; SI-NEXT: s_mov_b32 s6, 0
1298 ; SI-NEXT: s_mov_b32 s7, 0xf000
1299 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1300 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1301 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1302 ; SI-NEXT: s_mov_b32 s6, -1
1303 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1304 ; SI-NEXT: s_waitcnt vmcnt(0)
1305 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
1306 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1307 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1310 ; VI-LABEL: extract_byte2_to_f32:
1312 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1313 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1314 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1315 ; VI-NEXT: v_mov_b32_e32 v0, s2
1316 ; VI-NEXT: v_mov_b32_e32 v1, s3
1317 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1318 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1319 ; VI-NEXT: flat_load_dword v0, v[0:1]
1320 ; VI-NEXT: s_waitcnt vmcnt(0)
1321 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1322 ; VI-NEXT: v_mov_b32_e32 v0, s0
1323 ; VI-NEXT: v_mov_b32_e32 v1, s1
1324 ; VI-NEXT: flat_store_dword v[0:1], v2
1326 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1327 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1328 %val = load i32, ptr addrspace(1) %gep
1329 %srl = lshr i32 %val, 16 ; move bits 16..23 down to bits 0..7
1330 %and = and i32 %srl, 255
1331 %cvt = uitofp i32 %and to float
1332 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 3 of a loaded i32 (lshr by 24, then and with 255).
; The i32 is loaded from %in (indexed by workitem id x); the float is stored
; to %out.
; On both SI and VI the checks show a single v_cvt_f32_ubyte3_e32 applied to
; the loaded dword, with no separate shift or mask instruction.
1336 define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1337 ; SI-LABEL: extract_byte3_to_f32:
1339 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1340 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1341 ; SI-NEXT: v_mov_b32_e32 v1, 0
1342 ; SI-NEXT: s_mov_b32 s6, 0
1343 ; SI-NEXT: s_mov_b32 s7, 0xf000
1344 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1345 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1346 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1347 ; SI-NEXT: s_mov_b32 s6, -1
1348 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1349 ; SI-NEXT: s_waitcnt vmcnt(0)
1350 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1351 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1354 ; VI-LABEL: extract_byte3_to_f32:
1356 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1357 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1358 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1359 ; VI-NEXT: v_mov_b32_e32 v0, s2
1360 ; VI-NEXT: v_mov_b32_e32 v1, s3
1361 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1362 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1363 ; VI-NEXT: flat_load_dword v0, v[0:1]
1364 ; VI-NEXT: s_waitcnt vmcnt(0)
1365 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0
1366 ; VI-NEXT: v_mov_b32_e32 v0, s0
1367 ; VI-NEXT: v_mov_b32_e32 v1, s1
1368 ; VI-NEXT: flat_store_dword v[0:1], v2
1370 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1371 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1372 %val = load i32, ptr addrspace(1) %gep
1373 %srl = lshr i32 %val, 24 ; move bits 24..31 down to bits 0..7
1374 %and = and i32 %srl, 255
1375 %cvt = uitofp i32 %and to float
1376 store float %cvt, ptr addrspace(1) %out
; Test: byte-0 conversion when the source value has a second use.
; %or feeds both the masked uitofp and a bitcast to float; the two floats are
; added and the sum is stored to %out. Note the argument order here is
; (%in, %out).
; SI: the checks show v_or_b32, an explicit v_and_b32 with 0xff, then
;     v_cvt_f32_ubyte0, with the un-masked OR result used by v_add_f32.
; VI: the checks show v_or_b32 followed by v_cvt_f32_ubyte0 with SDWA
;     src0_sel:BYTE_0; the un-masked OR result is used by v_add_f32.
1380 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
1381 ; SI-LABEL: cvt_ubyte0_or_multiuse:
1382 ; SI: ; %bb.0: ; %bb
1383 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1384 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1385 ; SI-NEXT: v_mov_b32_e32 v1, 0
1386 ; SI-NEXT: s_mov_b32 s6, 0
1387 ; SI-NEXT: s_mov_b32 s7, 0xf000
1388 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1389 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
1390 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1391 ; SI-NEXT: s_mov_b32 s6, -1
1392 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1393 ; SI-NEXT: s_waitcnt vmcnt(0)
1394 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1395 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
1396 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1397 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
1398 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1401 ; VI-LABEL: cvt_ubyte0_or_multiuse:
1402 ; VI: ; %bb.0: ; %bb
1403 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1404 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1405 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1406 ; VI-NEXT: v_mov_b32_e32 v0, s0
1407 ; VI-NEXT: v_mov_b32_e32 v1, s1
1408 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1409 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1410 ; VI-NEXT: flat_load_dword v0, v[0:1]
1411 ; VI-NEXT: s_waitcnt vmcnt(0)
1412 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1413 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1414 ; VI-NEXT: v_add_f32_e32 v2, v0, v1
1415 ; VI-NEXT: v_mov_b32_e32 v0, s2
1416 ; VI-NEXT: v_mov_b32_e32 v1, s3
1417 ; VI-NEXT: flat_store_dword v[0:1], v2
1420 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1421 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
1422 %load = load i32, ptr addrspace(1) %gep
1423 %or = or i32 %load, -2147483647 ; appears as 0x80000001 in the checks
1424 %and = and i32 %or, 255 ; first use of %or: low byte only
1425 %uitofp = uitofp i32 %and to float
1426 %cast = bitcast i32 %or to float ; second use of %or: all 32 bits
1427 %add = fadd float %cast, %uitofp
1428 store float %add, ptr addrspace(1) %out
; Test: sitofp of an i64 whose value has been masked with 255.
; The checks on both SI and VI show a multi-instruction 64-bit conversion
; sequence (v_ffbh_i32, a 64-bit left shift, v_cvt_f32_i32, v_ldexp_f32);
; no v_cvt_f32_ubyte0 appears for this input type.
1432 define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
1433 ; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
1435 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1436 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1437 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1438 ; SI-NEXT: v_ffbh_i32_e32 v3, 0
1439 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
1440 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3
1441 ; SI-NEXT: v_mov_b32_e32 v1, 0
1442 ; SI-NEXT: v_min_u32_e32 v2, v3, v2
1443 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1444 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1445 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1446 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
1447 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1448 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1449 ; SI-NEXT: s_setpc_b64 s[30:31]
1451 ; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
1453 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1454 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1455 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1456 ; VI-NEXT: v_ffbh_i32_e32 v3, 0
1457 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
1458 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3
1459 ; VI-NEXT: v_mov_b32_e32 v1, 0
1460 ; VI-NEXT: v_min_u32_e32 v2, v3, v2
1461 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1462 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1463 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1464 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
1465 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1466 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1467 ; VI-NEXT: s_setpc_b64 s[30:31]
1468 %masked = and i64 %arg0, 255 ; only bits 0..7 can be set after this
1469 %itofp = sitofp i64 %masked to float
; Test: uitofp of an i64 whose value has been masked with 255.
; The checks on both SI and VI show a multi-instruction 64-bit conversion
; sequence (v_ffbh_u32, a 64-bit left shift, v_cvt_f32_u32, v_ldexp_f32);
; no v_cvt_f32_ubyte0 appears for this input type.
1473 define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
1474 ; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
1476 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1477 ; SI-NEXT: v_ffbh_u32_e32 v2, 0
1478 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1479 ; SI-NEXT: v_mov_b32_e32 v1, 0
1480 ; SI-NEXT: v_min_u32_e32 v2, 32, v2
1481 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1482 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1483 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1484 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
1485 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1486 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1487 ; SI-NEXT: s_setpc_b64 s[30:31]
1489 ; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
1491 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1492 ; VI-NEXT: v_ffbh_u32_e32 v2, 0
1493 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1494 ; VI-NEXT: v_mov_b32_e32 v1, 0
1495 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1496 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1497 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1498 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1499 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1500 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1501 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1502 ; VI-NEXT: s_setpc_b64 s[30:31]
1503 %masked = and i64 %arg0, 255 ; only bits 0..7 can be set after this
1504 %itofp = uitofp i64 %masked to float
; Test: sitofp of an i16 whose value has been masked with 255.
; SI: the checks show v_and_b32 with 0xff, a 16-bit sign extension
;     (v_bfe_i32 0, 16), then v_cvt_f32_ubyte0.
; VI: the checks show v_and_b32 with 0xff, then v_cvt_f32_ubyte0 in SDWA form
;     reading sext(v0) with src0_sel:WORD_0.
1508 define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
1509 ; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
1511 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1512 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1513 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
1514 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1515 ; SI-NEXT: s_setpc_b64 s[30:31]
1517 ; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
1519 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1520 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1521 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1522 ; VI-NEXT: s_setpc_b64 s[30:31]
1523 %masked = and i16 %arg0, 255 ; only bits 0..7 can be set after this
1524 %itofp = sitofp i16 %masked to float
; Test: uitofp of an i16 whose value has been masked with 255.
; SI: the checks show two v_and_b32 instructions (0xff, then 0xffff) before
;     v_cvt_f32_ubyte0.
; VI: the checks show 0xffff materialised with v_mov_b32, a v_and_b32 in SDWA
;     form with src1_sel:BYTE_0, then v_cvt_f32_ubyte0_e32.
1528 define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
1529 ; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
1531 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1532 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1533 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1534 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1535 ; SI-NEXT: s_setpc_b64 s[30:31]
1537 ; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
1539 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1540 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1541 ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1542 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1543 ; VI-NEXT: s_setpc_b64 s[30:31]
1544 %masked = and i16 %arg0, 255 ; only bits 0..7 can be set after this
1545 %itofp = uitofp i16 %masked to float