1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; uitofp of an i32 whose upper 24 bits are cleared (and 255). Both runs are
; expected to select v_cvt_f32_ubyte0: the tahiti run keeps the explicit
; v_and_b32, while the tonga run folds the byte select into the SDWA operand
; (src0_sel:BYTE_0).
8 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
9 ; SI-LABEL: v_uitofp_i32_to_f32_mask255:
11 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
13 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
14 ; SI-NEXT: s_setpc_b64 s[30:31]
16 ; VI-LABEL: v_uitofp_i32_to_f32_mask255:
18 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
20 ; VI-NEXT: s_setpc_b64 s[30:31]
21 %masked = and i32 %arg0, 255
22 %cvt = uitofp i32 %masked to float
; Signed variant of the mask-255 case. The masked value can never be negative,
; and the expected code is the same as for uitofp: v_cvt_f32_ubyte0 on byte 0
; (explicit v_and_b32 in the tahiti run, SDWA BYTE_0 in the tonga run).
26 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
27 ; SI-LABEL: v_sitofp_i32_to_f32_mask255:
29 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
31 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
32 ; SI-NEXT: s_setpc_b64 s[30:31]
34 ; VI-LABEL: v_sitofp_i32_to_f32_mask255:
36 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
38 ; VI-NEXT: s_setpc_b64 s[30:31]
39 %masked = and i32 %arg0, 255
40 %cvt = sitofp i32 %masked to float
; The 8-bit field starts at bit 7, i.e. it is not byte aligned. Both runs share
; the same checks: the field is extracted with v_bfe_u32 (offset 7, width 8)
; and then converted with v_cvt_f32_ubyte0.
44 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
45 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
47 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GCN-NEXT: v_bfe_u32 v0, v0, 7, 8
49 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
50 ; GCN-NEXT: s_setpc_b64 s[30:31]
51 %lshr.7 = lshr i32 %arg0, 7
52 %masked = and i32 %lshr.7, 255
53 %cvt = uitofp i32 %masked to float
; Converts byte 1 of the argument (lshr 8, and 255). The tahiti run extracts
; the byte with v_bfe_u32 before v_cvt_f32_ubyte0; the tonga run selects it
; directly through SDWA src0_sel:BYTE_1.
57 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
58 ; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
60 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
62 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
63 ; SI-NEXT: s_setpc_b64 s[30:31]
65 ; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
67 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
69 ; VI-NEXT: s_setpc_b64 s[30:31]
70 %lshr.8 = lshr i32 %arg0, 8
71 %masked = and i32 %lshr.8, 255
72 %cvt = uitofp i32 %masked to float
; Same byte-1 conversion, but the shifted value has a second use: it is also
; stored (to an undef global pointer). The checks keep the shift as a separate
; v_lshrrev_b32 feeding the store, and the conversion then reads byte 0 of the
; already shifted value instead of byte 1 of the argument.
76 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
77 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
79 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
81 ; SI-NEXT: s_mov_b32 s6, -1
82 ; SI-NEXT: s_mov_b32 s7, 0xf000
83 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
84 ; SI-NEXT: s_waitcnt expcnt(0)
85 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
86 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
87 ; SI-NEXT: s_waitcnt vmcnt(0)
88 ; SI-NEXT: s_setpc_b64 s[30:31]
90 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
92 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
94 ; VI-NEXT: flat_store_dword v[0:1], v0
95 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
96 ; VI-NEXT: s_waitcnt vmcnt(0)
97 ; VI-NEXT: s_setpc_b64 s[30:31]
98 %lshr.8 = lshr i32 %arg0, 8
99 store i32 %lshr.8, ptr addrspace(1) undef
100 %masked = and i32 %lshr.8, 255
101 %cvt = uitofp i32 %masked to float
; Converts byte 2 of the argument (lshr 16, and 255). The tahiti run extracts
; the byte with v_bfe_u32 (offset 16, width 8) before v_cvt_f32_ubyte0; the
; tonga run selects it through SDWA src0_sel:BYTE_2.
105 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
106 ; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
108 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
110 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
111 ; SI-NEXT: s_setpc_b64 s[30:31]
113 ; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
115 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
117 ; VI-NEXT: s_setpc_b64 s[30:31]
118 %lshr.16 = lshr i32 %arg0, 16
119 %masked = and i32 %lshr.16, 255
120 %cvt = uitofp i32 %masked to float
; Converts byte 3 (the top byte) of the argument (lshr 24, and 255). Both runs
; share the same checks: a single v_cvt_f32_ubyte3 with no separate shift or
; mask instruction.
; The shifted value is named after its shift amount (24) to match the
; %lshr.<amount> naming used by the other tests in this file.
124 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
125 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
127 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
129 ; GCN-NEXT: s_setpc_b64 s[30:31]
130 %lshr.24 = lshr i32 %arg0, 24
131 %masked = and i32 %lshr.24, 255
132 %cvt = uitofp i32 %masked to float
; uitofp directly from an i8 argument (no explicit mask in the IR). The
; expected code matches the mask-255 case: v_and_b32 + v_cvt_f32_ubyte0 in the
; tahiti run, SDWA BYTE_0 in the tonga run.
136 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
137 ; SI-LABEL: v_uitofp_i8_to_f32:
139 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
141 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
142 ; SI-NEXT: s_setpc_b64 s[30:31]
144 ; VI-LABEL: v_uitofp_i8_to_f32:
146 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
148 ; VI-NEXT: s_setpc_b64 s[30:31]
149 %cvt = uitofp i8 %arg0 to float
; Converts both elements of a <2 x i8> obtained by bitcasting an i16 argument.
; The tahiti run extracts each byte (v_and_b32 / v_bfe_u32) before
; v_cvt_f32_ubyte0; the tonga run uses SDWA BYTE_0 and BYTE_1. Element 0 is
; computed into a temporary register and moved into v0 at the end.
153 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
154 ; SI-LABEL: v_uitofp_v2i8_to_v2f32:
156 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
158 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
159 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
160 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
161 ; SI-NEXT: v_mov_b32_e32 v0, v2
162 ; SI-NEXT: s_setpc_b64 s[30:31]
164 ; VI-LABEL: v_uitofp_v2i8_to_v2f32:
166 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
168 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
169 ; VI-NEXT: v_mov_b32_e32 v0, v2
170 ; VI-NEXT: s_setpc_b64 s[30:31]
171 %val = bitcast i16 %arg0 to <2 x i8>
172 %cvt = uitofp <2 x i8> %val to <2 x float>
; Converts the three elements of a <3 x i8> formed by truncating the i32
; argument to i24 and bitcasting. Bytes 0, 1 and 2 are converted with
; v_cvt_f32_ubyte0 (after v_and_b32 / v_bfe_u32 in the tahiti run, via SDWA
; BYTE_0..BYTE_2 in the tonga run); byte 3 of the argument is not used.
176 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
177 ; SI-LABEL: v_uitofp_v3i8_to_v3f32:
179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
181 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
182 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
183 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
184 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
185 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
186 ; SI-NEXT: v_mov_b32_e32 v0, v3
187 ; SI-NEXT: s_setpc_b64 s[30:31]
189 ; VI-LABEL: v_uitofp_v3i8_to_v3f32:
191 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
193 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
194 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
195 ; VI-NEXT: v_mov_b32_e32 v0, v3
196 ; VI-NEXT: s_setpc_b64 s[30:31]
197 %trunc = trunc i32 %arg0 to i24
198 %val = bitcast i24 %trunc to <3 x i8>
199 %cvt = uitofp <3 x i8> %val to <3 x float>
; Converts all four bytes of an i32 argument bitcast to <4 x i8>. Bytes 0-2
; use v_cvt_f32_ubyte0 (after v_and_b32 / v_bfe_u32 in the tahiti run, via SDWA
; byte selects in the tonga run); byte 3 uses v_cvt_f32_ubyte3 in both runs.
203 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
204 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
206 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
208 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
209 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
210 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
211 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
212 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
213 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
214 ; SI-NEXT: v_mov_b32_e32 v0, v4
215 ; SI-NEXT: s_setpc_b64 s[30:31]
217 ; VI-LABEL: v_uitofp_v4i8_to_v4f32:
219 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
221 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
222 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
223 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
224 ; VI-NEXT: v_mov_b32_e32 v0, v4
225 ; VI-NEXT: s_setpc_b64 s[30:31]
226 %val = bitcast i32 %arg0 to <4 x i8>
227 %cvt = uitofp <4 x i8> %val to <4 x float>
; Scalar form of the <4 x i8> unpack: each byte of the i32 argument is
; isolated with lshr + and 255, converted separately, and the four floats are
; inserted into a <4 x float>. The expected instructions are the same as for
; v_uitofp_v4i8_to_v4f32.
231 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
232 ; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
234 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
236 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
237 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
238 ; SI-NEXT: v_bfe_u32 v2, v0, 16, 8
239 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
240 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
241 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
242 ; SI-NEXT: v_mov_b32_e32 v0, v4
243 ; SI-NEXT: s_setpc_b64 s[30:31]
245 ; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
247 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
249 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
250 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
251 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
252 ; VI-NEXT: v_mov_b32_e32 v0, v4
253 ; VI-NEXT: s_setpc_b64 s[30:31]
254 %mask.arg0 = and i32 %arg0, 255
255 %cvt0 = uitofp i32 %mask.arg0 to float
257 %lshr.8 = lshr i32 %arg0, 8
258 %mask.lshr.8 = and i32 %lshr.8, 255
259 %cvt1 = uitofp i32 %mask.lshr.8 to float
261 %lshr.16 = lshr i32 %arg0, 16
262 %mask.lshr.16 = and i32 %lshr.16, 255
263 %cvt2 = uitofp i32 %mask.lshr.16 to float
265 %lshr.24 = lshr i32 %arg0, 24
266 %mask.lshr.24 = and i32 %lshr.24, 255
267 %cvt3 = uitofp i32 %mask.lshr.24 to float
269 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
270 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
271 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
272 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
273 ret <4 x float> %ins.3
; Half-precision variant of the mask-255 case. The expected code converts the
; low byte to f32 with v_cvt_f32_ubyte0 and then narrows the result with
; v_cvt_f16_f32 in both runs.
276 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
277 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
279 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
281 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
282 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
283 ; SI-NEXT: s_setpc_b64 s[30:31]
285 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
287 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
289 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
290 ; VI-NEXT: s_setpc_b64 s[30:31]
291 %masked = and i32 %arg0, 255
292 %cvt = uitofp i32 %masked to half
; Signed, half-precision variant of the mask-255 case. The expected code is the
; same as for the unsigned f16 test: v_cvt_f32_ubyte0 on byte 0 followed by
; v_cvt_f16_f32.
296 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
297 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
299 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
301 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
302 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
303 ; SI-NEXT: s_setpc_b64 s[30:31]
305 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
307 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
309 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
310 ; VI-NEXT: s_setpc_b64 s[30:31]
311 %masked = and i32 %arg0, 255
312 %cvt = sitofp i32 %masked to half
; Byte 1 of the argument converted to half: the byte is converted to f32
; (v_bfe_u32 + v_cvt_f32_ubyte0 in the tahiti run, SDWA BYTE_1 in the tonga
; run) and then narrowed with v_cvt_f16_f32.
316 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
317 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
319 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
321 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
322 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
323 ; SI-NEXT: s_setpc_b64 s[30:31]
325 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
327 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
329 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
330 ; VI-NEXT: s_setpc_b64 s[30:31]
331 %lshr.8 = lshr i32 %arg0, 8
332 %masked = and i32 %lshr.8, 255
333 %cvt = uitofp i32 %masked to half
; Byte 2 of the argument converted to half: the byte is converted to f32
; (v_bfe_u32 + v_cvt_f32_ubyte0 in the tahiti run, SDWA BYTE_2 in the tonga
; run) and then narrowed with v_cvt_f16_f32.
337 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
338 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
342 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
343 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
344 ; SI-NEXT: s_setpc_b64 s[30:31]
346 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
348 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
350 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
351 ; VI-NEXT: s_setpc_b64 s[30:31]
352 %lshr.16 = lshr i32 %arg0, 16
353 %masked = and i32 %lshr.16, 255
354 %cvt = uitofp i32 %masked to half
; Byte 3 (the top byte) of the argument converted to half. Both runs share the
; same checks: v_cvt_f32_ubyte3 followed by v_cvt_f16_f32, with no separate
; shift or mask instruction.
; The shifted value is named after its shift amount (24) to match the
; %lshr.<amount> naming used by the other tests in this file.
358 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
359 ; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
361 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
363 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
364 ; GCN-NEXT: s_setpc_b64 s[30:31]
365 %lshr.24 = lshr i32 %arg0, 24
366 %masked = and i32 %lshr.24, 255
367 %cvt = uitofp i32 %masked to half
; uitofp directly from an i8 argument to half. The expected code converts
; byte 0 to f32 with v_cvt_f32_ubyte0 (explicit v_and_b32 in the tahiti run,
; SDWA BYTE_0 in the tonga run) and narrows it with v_cvt_f16_f32.
371 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
372 ; SI-LABEL: v_uitofp_i8_to_f16:
374 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
376 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
377 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
378 ; SI-NEXT: s_setpc_b64 s[30:31]
380 ; VI-LABEL: v_uitofp_i8_to_f16:
382 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
384 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
385 ; VI-NEXT: s_setpc_b64 s[30:31]
386 %cvt = uitofp i8 %arg0 to half
; Double-precision variant of the mask-255 case. No byte-conversion
; instruction appears in the checks: both runs mask with v_and_b32 and convert
; with the generic v_cvt_f64_u32.
390 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
391 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
393 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
395 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
396 ; GCN-NEXT: s_setpc_b64 s[30:31]
397 %masked = and i32 %arg0, 255
398 %cvt = uitofp i32 %masked to double
; Byte 1 of the argument converted to double. Both runs extract the byte with
; v_bfe_u32 (offset 8, width 8) and convert with v_cvt_f64_u32.
402 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
403 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
405 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
407 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
408 ; GCN-NEXT: s_setpc_b64 s[30:31]
409 %lshr.8 = lshr i32 %arg0, 8
410 %masked = and i32 %lshr.8, 255
411 %cvt = uitofp i32 %masked to double
; Byte 2 of the argument converted to double. Both runs extract the byte with
; v_bfe_u32 (offset 16, width 8) and convert with v_cvt_f64_u32.
415 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
416 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
418 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
420 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
421 ; GCN-NEXT: s_setpc_b64 s[30:31]
422 %lshr.16 = lshr i32 %arg0, 16
423 %masked = and i32 %lshr.16, 255
424 %cvt = uitofp i32 %masked to double
; Byte 3 (the top byte) of the argument converted to double. Both runs shift
; with v_lshrrev_b32 by 24 and convert with v_cvt_f64_u32; no separate mask
; instruction appears in the checks.
; The shifted value is named after its shift amount (24) to match the
; %lshr.<amount> naming used by the other tests in this file.
428 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
429 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
431 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
433 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
434 ; GCN-NEXT: s_setpc_b64 s[30:31]
435 %lshr.24 = lshr i32 %arg0, 24
436 %masked = and i32 %lshr.24, 255
437 %cvt = uitofp i32 %masked to double
; uitofp directly from an i8 argument to double. Both runs mask the low byte
; with v_and_b32 and convert with v_cvt_f64_u32.
441 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
442 ; GCN-LABEL: v_uitofp_i8_to_f64:
444 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
446 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
447 ; GCN-NEXT: s_setpc_b64 s[30:31]
448 %cvt = uitofp i8 %arg0 to double
; Kernel: loads one i8 from %in at index workitem.id.x, converts it to float
; and stores the result to %out. The zero-extending byte load
; (buffer_load_ubyte / flat_load_ubyte) feeds v_cvt_f32_ubyte0 directly; no
; extra mask instruction appears in the checks.
452 define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
453 ; SI-LABEL: load_i8_to_f32:
455 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
456 ; SI-NEXT: s_mov_b32 s6, 0
457 ; SI-NEXT: s_mov_b32 s7, 0xf000
458 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
460 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
461 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
462 ; SI-NEXT: s_mov_b32 s6, -1
463 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
464 ; SI-NEXT: s_waitcnt vmcnt(0)
465 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
466 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
469 ; VI-LABEL: load_i8_to_f32:
471 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
472 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
473 ; VI-NEXT: s_waitcnt lgkmcnt(0)
474 ; VI-NEXT: v_mov_b32_e32 v1, s2
475 ; VI-NEXT: v_mov_b32_e32 v2, s3
476 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
477 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
478 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
479 ; VI-NEXT: s_waitcnt vmcnt(0)
480 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
481 ; VI-NEXT: v_mov_b32_e32 v0, s0
482 ; VI-NEXT: v_mov_b32_e32 v1, s1
483 ; VI-NEXT: flat_store_dword v[0:1], v2
485 %tid = call i32 @llvm.amdgcn.workitem.id.x()
486 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
487 %load = load i8, ptr addrspace(1) %gep, align 1
488 %cvt = uitofp i8 %load to float
489 store float %cvt, ptr addrspace(1) %out, align 4
; Kernel: loads a <2 x i8> (align 2) indexed by workitem.id.x, converts both
; elements to float and stores the <2 x float> to %out. The vector is fetched
; with a single 16-bit load; the tahiti run then extracts each byte with
; v_and_b32 / v_bfe_u32, while the tonga run uses SDWA BYTE_0 / BYTE_1.
493 define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
494 ; SI-LABEL: load_v2i8_to_v2f32:
496 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
497 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
498 ; SI-NEXT: v_mov_b32_e32 v1, 0
499 ; SI-NEXT: s_mov_b32 s6, 0
500 ; SI-NEXT: s_mov_b32 s7, 0xf000
501 ; SI-NEXT: s_waitcnt lgkmcnt(0)
502 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
503 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
504 ; SI-NEXT: s_mov_b32 s6, -1
505 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
506 ; SI-NEXT: s_waitcnt vmcnt(0)
507 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
508 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
509 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
510 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
511 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
514 ; VI-LABEL: load_v2i8_to_v2f32:
516 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
517 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
518 ; VI-NEXT: s_waitcnt lgkmcnt(0)
519 ; VI-NEXT: v_mov_b32_e32 v0, s2
520 ; VI-NEXT: v_mov_b32_e32 v1, s3
521 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
522 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
523 ; VI-NEXT: flat_load_ushort v1, v[0:1]
524 ; VI-NEXT: v_mov_b32_e32 v3, s1
525 ; VI-NEXT: v_mov_b32_e32 v2, s0
526 ; VI-NEXT: s_waitcnt vmcnt(0)
527 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
528 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
529 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
531 %tid = call i32 @llvm.amdgcn.workitem.id.x()
532 %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
533 %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
534 %cvt = uitofp <2 x i8> %load to <2 x float>
535 store <2 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: loads a <3 x i8> (align 4) indexed by workitem.id.x, converts the
; three elements to float and stores the <3 x float> to %out. The vector is
; fetched with a single dword load and bytes 0-2 are converted. The tahiti run
; writes the result as a dwordx2 store plus a dword store at offset 8; the
; tonga run uses one dwordx3 store.
539 define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
540 ; SI-LABEL: load_v3i8_to_v3f32:
542 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
543 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
544 ; SI-NEXT: v_mov_b32_e32 v1, 0
545 ; SI-NEXT: s_mov_b32 s6, 0
546 ; SI-NEXT: s_mov_b32 s7, 0xf000
547 ; SI-NEXT: s_waitcnt lgkmcnt(0)
548 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
549 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
550 ; SI-NEXT: s_mov_b32 s6, -1
551 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
552 ; SI-NEXT: s_waitcnt vmcnt(0)
553 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
554 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
555 ; SI-NEXT: v_bfe_u32 v3, v0, 16, 8
556 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
557 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
558 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
559 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
560 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
563 ; VI-LABEL: load_v3i8_to_v3f32:
565 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
566 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
567 ; VI-NEXT: s_waitcnt lgkmcnt(0)
568 ; VI-NEXT: v_mov_b32_e32 v0, s2
569 ; VI-NEXT: v_mov_b32_e32 v1, s3
570 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
571 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
572 ; VI-NEXT: flat_load_dword v2, v[0:1]
573 ; VI-NEXT: v_mov_b32_e32 v4, s1
574 ; VI-NEXT: v_mov_b32_e32 v3, s0
575 ; VI-NEXT: s_waitcnt vmcnt(0)
576 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
577 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
578 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
579 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
581 %tid = call i32 @llvm.amdgcn.workitem.id.x()
582 %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
583 %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
584 %cvt = uitofp <3 x i8> %load to <3 x float>
585 store <3 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: loads a <4 x i8> (align 4) indexed by workitem.id.x, converts all
; four elements to float and stores the <4 x float> to %out. The vector is
; fetched with a single dword load. Bytes 0-2 go through v_cvt_f32_ubyte0
; (v_and_b32 / v_bfe_u32 in the tahiti run, SDWA byte selects in the tonga
; run) and byte 3 through v_cvt_f32_ubyte3.
589 define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
590 ; SI-LABEL: load_v4i8_to_v4f32:
592 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
593 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
594 ; SI-NEXT: v_mov_b32_e32 v1, 0
595 ; SI-NEXT: s_mov_b32 s6, 0
596 ; SI-NEXT: s_mov_b32 s7, 0xf000
597 ; SI-NEXT: s_waitcnt lgkmcnt(0)
598 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
599 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
600 ; SI-NEXT: s_mov_b32 s6, -1
601 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
602 ; SI-NEXT: s_waitcnt vmcnt(0)
603 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
604 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
605 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
606 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
607 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
608 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
609 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
610 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
613 ; VI-LABEL: load_v4i8_to_v4f32:
615 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
616 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
617 ; VI-NEXT: s_waitcnt lgkmcnt(0)
618 ; VI-NEXT: v_mov_b32_e32 v0, s2
619 ; VI-NEXT: v_mov_b32_e32 v1, s3
620 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
621 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
622 ; VI-NEXT: flat_load_dword v3, v[0:1]
623 ; VI-NEXT: v_mov_b32_e32 v5, s1
624 ; VI-NEXT: v_mov_b32_e32 v4, s0
625 ; VI-NEXT: s_waitcnt vmcnt(0)
626 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
627 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
628 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
629 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
630 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
632 %tid = call i32 @llvm.amdgcn.workitem.id.x()
633 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
634 %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
635 %cvt = uitofp <4 x i8> %load to <4 x float>
636 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: same conversion as load_v4i8_to_v4f32, but the <4 x i8> load is only
; byte aligned (align 1), so the checks show four separate byte loads.
640 ; This should not be adding instructions to shift into the correct
641 ; position in the word for the component.
643 ; FIXME: Packing bytes. The current checks show the four loaded bytes being
; shifted (v_lshlrev_b32) and combined (v_or_b32) into one dword, which is then
; unpacked again by the byte conversions.
644 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
645 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
647 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
648 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
649 ; SI-NEXT: v_mov_b32_e32 v1, 0
650 ; SI-NEXT: s_mov_b32 s6, 0
651 ; SI-NEXT: s_mov_b32 s7, 0xf000
652 ; SI-NEXT: s_waitcnt lgkmcnt(0)
653 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
654 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
655 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
656 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
657 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
658 ; SI-NEXT: s_mov_b32 s6, -1
659 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
660 ; SI-NEXT: s_waitcnt vmcnt(3)
661 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
662 ; SI-NEXT: s_waitcnt vmcnt(2)
663 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
664 ; SI-NEXT: s_waitcnt vmcnt(1)
665 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
666 ; SI-NEXT: s_waitcnt vmcnt(0)
667 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
668 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
669 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
670 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
671 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
672 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
673 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
674 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
675 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
676 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
677 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
680 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
682 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
683 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
685 ; VI-NEXT: v_mov_b32_e32 v0, s2
686 ; VI-NEXT: v_mov_b32_e32 v1, s3
687 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
688 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
689 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
690 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
691 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
692 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
693 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
694 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
695 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
696 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
697 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
698 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
699 ; VI-NEXT: s_waitcnt vmcnt(3)
700 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
701 ; VI-NEXT: s_waitcnt vmcnt(2)
702 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
703 ; VI-NEXT: s_waitcnt vmcnt(1)
704 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
705 ; VI-NEXT: s_waitcnt vmcnt(0)
706 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
707 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
708 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
709 ; VI-NEXT: v_mov_b32_e32 v5, s1
710 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
711 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
712 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
713 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
714 ; VI-NEXT: v_mov_b32_e32 v4, s0
715 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
717 %tid = call i32 @llvm.amdgcn.workitem.id.x()
718 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
719 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
720 %cvt = uitofp <4 x i8> %load to <4 x float>
721 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel: the loaded <4 x i8> has two uses. It is converted to <4 x float> and
; stored to %out, and it is also incremented element-wise by 9 and stored as
; <4 x i8> to %out2. The checks cover both the byte conversions and the code
; that adds 9 to each byte and re-packs the four results into one dword.
725 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
726 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
728 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
729 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
730 ; SI-NEXT: v_mov_b32_e32 v1, 0
731 ; SI-NEXT: s_mov_b32 s6, 0
732 ; SI-NEXT: s_mov_b32 s7, 0xf000
733 ; SI-NEXT: s_waitcnt lgkmcnt(0)
734 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
735 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
736 ; SI-NEXT: s_mov_b32 s6, -1
737 ; SI-NEXT: s_waitcnt lgkmcnt(0)
738 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
739 ; SI-NEXT: s_waitcnt vmcnt(0)
740 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
741 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
742 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v0
743 ; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
744 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
745 ; SI-NEXT: v_add_i32_e32 v6, vcc, 9, v0
746 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
747 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v1
748 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v2
749 ; SI-NEXT: v_add_i32_e32 v8, vcc, 9, v1
750 ; SI-NEXT: v_add_i32_e32 v9, vcc, 9, v2
751 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
752 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
753 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v8
754 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
755 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
756 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v9
757 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
758 ; SI-NEXT: s_waitcnt expcnt(0)
759 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
760 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
761 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
762 ; SI-NEXT: v_or_b32_e32 v0, v6, v0
763 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v4
764 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
765 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
766 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
767 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
770 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
772 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
773 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
774 ; VI-NEXT: v_mov_b32_e32 v6, 9
775 ; VI-NEXT: v_mov_b32_e32 v7, 8
776 ; VI-NEXT: s_waitcnt lgkmcnt(0)
777 ; VI-NEXT: v_mov_b32_e32 v0, s0
778 ; VI-NEXT: v_mov_b32_e32 v1, s1
779 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
780 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
781 ; VI-NEXT: flat_load_dword v1, v[0:1]
782 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
783 ; VI-NEXT: v_mov_b32_e32 v2, 0xff
784 ; VI-NEXT: s_waitcnt lgkmcnt(0)
785 ; VI-NEXT: v_mov_b32_e32 v5, s1
786 ; VI-NEXT: v_mov_b32_e32 v4, s0
787 ; VI-NEXT: s_waitcnt vmcnt(0)
788 ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
789 ; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
790 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
791 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
792 ; VI-NEXT: v_add_u16_e32 v9, 9, v1
793 ; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
794 ; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
795 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
796 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
797 ; VI-NEXT: v_add_u16_e32 v8, 9, v8
798 ; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
799 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
800 ; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
801 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
802 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
803 ; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
804 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
805 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
806 ; VI-NEXT: v_or_b32_e32 v2, v0, v2
807 ; VI-NEXT: v_mov_b32_e32 v0, s2
808 ; VI-NEXT: v_mov_b32_e32 v1, s3
809 ; VI-NEXT: flat_store_dword v[0:1], v2
811 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
812 %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
813 %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
814 %cvt = uitofp <4 x i8> %load to <4 x float>
815 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
816 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
817 store <4 x i8> %add, ptr addrspace(1) %out2, align 4
; Test: unsigned conversion of a byte-aligned (align 1) <7 x i8> vector load
; to <7 x float>.
; The checks below expect seven separate byte loads, each converted with its
; own v_cvt_f32_ubyte0 after the matching s_waitcnt vmcnt(N).
; The result is written as dwordx4 + dwordx2 + dword stores on SI (tahiti)
; and as dwordx4 + dwordx3 stores on VI (tonga).
821 define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
822 ; SI-LABEL: load_v7i8_to_v7f32:
824 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
825 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
826 ; SI-NEXT: v_mov_b32_e32 v1, 0
827 ; SI-NEXT: s_mov_b32 s6, 0
828 ; SI-NEXT: s_mov_b32 s7, 0xf000
829 ; SI-NEXT: s_waitcnt lgkmcnt(0)
830 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
831 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
832 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
833 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
834 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
835 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
836 ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
837 ; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
838 ; SI-NEXT: s_mov_b32 s6, -1
839 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
840 ; SI-NEXT: s_waitcnt vmcnt(6)
841 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
842 ; SI-NEXT: s_waitcnt vmcnt(5)
843 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
844 ; SI-NEXT: s_waitcnt vmcnt(4)
845 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
846 ; SI-NEXT: s_waitcnt vmcnt(3)
847 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
848 ; SI-NEXT: s_waitcnt vmcnt(2)
849 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
850 ; SI-NEXT: s_waitcnt vmcnt(1)
851 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
852 ; SI-NEXT: s_waitcnt vmcnt(0)
853 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
854 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
855 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
856 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24
859 ; VI-LABEL: load_v7i8_to_v7f32:
861 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
862 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
863 ; VI-NEXT: s_waitcnt lgkmcnt(0)
864 ; VI-NEXT: v_mov_b32_e32 v0, s2
865 ; VI-NEXT: v_mov_b32_e32 v1, s3
866 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
867 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
868 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
869 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
870 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
871 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
872 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
873 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
874 ; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
875 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
876 ; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0
877 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
878 ; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0
879 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
880 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
881 ; VI-NEXT: flat_load_ubyte v1, v[2:3]
882 ; VI-NEXT: flat_load_ubyte v2, v[4:5]
883 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
884 ; VI-NEXT: flat_load_ubyte v4, v[8:9]
885 ; VI-NEXT: flat_load_ubyte v5, v[10:11]
886 ; VI-NEXT: flat_load_ubyte v6, v[12:13]
887 ; VI-NEXT: v_mov_b32_e32 v8, s1
888 ; VI-NEXT: v_mov_b32_e32 v7, s0
889 ; VI-NEXT: s_add_u32 s0, s0, 16
890 ; VI-NEXT: s_addc_u32 s1, s1, 0
891 ; VI-NEXT: v_mov_b32_e32 v10, s1
892 ; VI-NEXT: v_mov_b32_e32 v9, s0
893 ; VI-NEXT: s_waitcnt vmcnt(6)
894 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
895 ; VI-NEXT: s_waitcnt vmcnt(5)
896 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
897 ; VI-NEXT: s_waitcnt vmcnt(4)
898 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
899 ; VI-NEXT: s_waitcnt vmcnt(3)
900 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
901 ; VI-NEXT: s_waitcnt vmcnt(2)
902 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
903 ; VI-NEXT: s_waitcnt vmcnt(1)
904 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v5
905 ; VI-NEXT: s_waitcnt vmcnt(0)
906 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
907 ; VI-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
908 ; VI-NEXT: flat_store_dwordx3 v[9:10], v[4:6]
; Per-workitem element: index %in by the workitem id, load, convert, store.
910 %tid = call i32 @llvm.amdgcn.workitem.id.x()
911 %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
912 %load = load <7 x i8>, ptr addrspace(1) %gep, align 1 ; byte alignment only
913 %cvt = uitofp <7 x i8> %load to <7 x float>
914 store <7 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: unsigned conversion of an 8-byte aligned <8 x i8> vector load to
; <8 x float>.
; The checks below expect a single dwordx2 load. On SI (tahiti) bytes 0-2 of
; each dword are extracted with v_and/v_bfe_u32 and converted with
; v_cvt_f32_ubyte0, while byte 3 uses v_cvt_f32_ubyte3 directly. On VI (tonga)
; bytes 0-2 are selected with SDWA src0_sel:BYTE_n and byte 3 again uses
; v_cvt_f32_ubyte3.
918 define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
919 ; SI-LABEL: load_v8i8_to_v8f32:
921 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
922 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
923 ; SI-NEXT: v_mov_b32_e32 v1, 0
924 ; SI-NEXT: s_mov_b32 s6, 0
925 ; SI-NEXT: s_mov_b32 s7, 0xf000
926 ; SI-NEXT: s_waitcnt lgkmcnt(0)
927 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
928 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
929 ; SI-NEXT: s_mov_b32 s6, -1
930 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
931 ; SI-NEXT: s_waitcnt vmcnt(0)
932 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
933 ; SI-NEXT: v_bfe_u32 v4, v0, 8, 8
934 ; SI-NEXT: v_bfe_u32 v5, v0, 16, 8
935 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
936 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
937 ; SI-NEXT: v_bfe_u32 v8, v1, 8, 8
938 ; SI-NEXT: v_bfe_u32 v9, v1, 16, 8
939 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v1
940 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
941 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
942 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
943 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
944 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v8
945 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9
946 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
947 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
950 ; VI-LABEL: load_v8i8_to_v8f32:
952 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
953 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
954 ; VI-NEXT: s_waitcnt lgkmcnt(0)
955 ; VI-NEXT: v_mov_b32_e32 v0, s2
956 ; VI-NEXT: v_mov_b32_e32 v1, s3
957 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
958 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
959 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
960 ; VI-NEXT: v_mov_b32_e32 v9, s1
961 ; VI-NEXT: v_mov_b32_e32 v8, s0
962 ; VI-NEXT: s_add_u32 s0, s0, 16
963 ; VI-NEXT: s_addc_u32 s1, s1, 0
964 ; VI-NEXT: v_mov_b32_e32 v11, s1
965 ; VI-NEXT: v_mov_b32_e32 v10, s0
966 ; VI-NEXT: s_waitcnt vmcnt(0)
967 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
968 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
969 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
970 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v6
971 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
972 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
973 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
974 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v7
975 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
976 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; Per-workitem element: index %in by the workitem id, load, convert, store.
978 %tid = call i32 @llvm.amdgcn.workitem.id.x()
979 %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
980 %load = load <8 x i8>, ptr addrspace(1) %gep, align 8 ; aligned to the full 8-byte vector
981 %cvt = uitofp <8 x i8> %load to <8 x float>
982 store <8 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: uitofp of the low byte of an i32 that is produced by an add, i.e.
; the "and 255" mask sits between arithmetic and the conversion.
; The checks below expect the add to stay a full 32-bit add. SI (tahiti)
; then masks with v_and_b32 0xff before v_cvt_f32_ubyte0, and VI (tonga)
; folds the mask into an SDWA src0_sel:BYTE_0 conversion.
986 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
987 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
989 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
990 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
991 ; SI-NEXT: v_mov_b32_e32 v1, 0
992 ; SI-NEXT: s_mov_b32 s6, 0
993 ; SI-NEXT: s_mov_b32 s7, 0xf000
994 ; SI-NEXT: s_waitcnt lgkmcnt(0)
995 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
996 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
997 ; SI-NEXT: s_mov_b32 s6, -1
998 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
999 ; SI-NEXT: s_waitcnt vmcnt(0)
1000 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
1001 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1002 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1003 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1006 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
1008 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1009 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1010 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1011 ; VI-NEXT: v_mov_b32_e32 v0, s2
1012 ; VI-NEXT: v_mov_b32_e32 v1, s3
1013 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1014 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1015 ; VI-NEXT: flat_load_dword v0, v[0:1]
1016 ; VI-NEXT: s_waitcnt vmcnt(0)
1017 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
1018 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1019 ; VI-NEXT: v_mov_b32_e32 v0, s0
1020 ; VI-NEXT: v_mov_b32_e32 v1, s1
1021 ; VI-NEXT: flat_store_dword v[0:1], v2
1023 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1024 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1025 %load = load i32, ptr addrspace(1) %gep, align 4
1026 %add = add i32 %load, 2
1027 %inreg = and i32 %add, 255 ; keep only the low byte of the sum
1028 %cvt = uitofp i32 %inreg to float
1029 store float %cvt, ptr addrspace(1) %out, align 4
; Test: uitofp of byte 1 of an i32, written as mask-then-shift
; (and 0xff00, then lshr 8).
; The checks below expect SI (tahiti) to extract the byte with
; v_bfe_u32 ..., 8, 8 and convert it with v_cvt_f32_ubyte0, and VI (tonga) to
; use one SDWA conversion with src0_sel:BYTE_1.
1033 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1034 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
1036 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1037 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1038 ; SI-NEXT: v_mov_b32_e32 v1, 0
1039 ; SI-NEXT: s_mov_b32 s6, 0
1040 ; SI-NEXT: s_mov_b32 s7, 0xf000
1041 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1042 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1043 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1044 ; SI-NEXT: s_mov_b32 s6, -1
1045 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1046 ; SI-NEXT: s_waitcnt vmcnt(0)
1047 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
1048 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1049 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1052 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
1054 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1055 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1056 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1057 ; VI-NEXT: v_mov_b32_e32 v0, s2
1058 ; VI-NEXT: v_mov_b32_e32 v1, s3
1059 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1060 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1061 ; VI-NEXT: flat_load_dword v0, v[0:1]
1062 ; VI-NEXT: s_waitcnt vmcnt(0)
1063 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1064 ; VI-NEXT: v_mov_b32_e32 v0, s0
1065 ; VI-NEXT: v_mov_b32_e32 v1, s1
1066 ; VI-NEXT: flat_store_dword v[0:1], v2
1068 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1069 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1070 %load = load i32, ptr addrspace(1) %gep, align 4
1071 %inreg = and i32 %load, 65280 ; 65280 == 0xff00: isolate bits 15:8
1072 %shr = lshr i32 %inreg, 8 ; move the isolated byte down to bits 7:0
1073 %cvt = uitofp i32 %shr to float
1074 store float %cvt, ptr addrspace(1) %out, align 4
1078 ; We don't get these ones because of the zext, but instcombine removes
1079 ; them so it shouldn't really matter.
; Test: uitofp of an i8 load that is explicitly zero-extended to i32 first.
; NOTE(review): the checks below do expect v_cvt_f32_ubyte0 fed straight from
; the byte load on both targets, so the remark above may predate the current
; output - confirm before relying on it.
1080 define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1081 ; SI-LABEL: i8_zext_i32_to_f32:
1083 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1084 ; SI-NEXT: s_mov_b32 s6, 0
1085 ; SI-NEXT: s_mov_b32 s7, 0xf000
1086 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1087 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1088 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1089 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1090 ; SI-NEXT: s_mov_b32 s6, -1
1091 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1092 ; SI-NEXT: s_waitcnt vmcnt(0)
1093 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1094 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1097 ; VI-LABEL: i8_zext_i32_to_f32:
1099 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1100 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1101 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1102 ; VI-NEXT: v_mov_b32_e32 v1, s2
1103 ; VI-NEXT: v_mov_b32_e32 v2, s3
1104 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
1105 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
1106 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1107 ; VI-NEXT: s_waitcnt vmcnt(0)
1108 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
1109 ; VI-NEXT: v_mov_b32_e32 v0, s0
1110 ; VI-NEXT: v_mov_b32_e32 v1, s1
1111 ; VI-NEXT: flat_store_dword v[0:1], v2
1113 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1114 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
1115 %load = load i8, ptr addrspace(1) %gep, align 1
1116 %ext = zext i8 %load to i32 ; explicit widening before the conversion
1117 %cvt = uitofp i32 %ext to float
1118 store float %cvt, ptr addrspace(1) %out, align 4
; Test: uitofp of a byte-aligned (align 1) <4 x i8> load that is explicitly
; zero-extended to <4 x i32> first.
; The checks below expect four separate byte loads that are reassembled into
; one dword with shifts and ORs, after which the individual bytes are
; converted: v_and/v_bfe_u32 + v_cvt_f32_ubyte0 on SI (tahiti), SDWA
; src0_sel:BYTE_n on VI (tonga), and v_cvt_f32_ubyte3 for byte 3 on both.
1122 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1123 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
1125 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1126 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1127 ; SI-NEXT: v_mov_b32_e32 v1, 0
1128 ; SI-NEXT: s_mov_b32 s6, 0
1129 ; SI-NEXT: s_mov_b32 s7, 0xf000
1130 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1131 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1132 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
1133 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
1134 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
1135 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1136 ; SI-NEXT: s_mov_b32 s6, -1
1137 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1138 ; SI-NEXT: s_waitcnt vmcnt(3)
1139 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1140 ; SI-NEXT: s_waitcnt vmcnt(2)
1141 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1142 ; SI-NEXT: s_waitcnt vmcnt(1)
1143 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1144 ; SI-NEXT: s_waitcnt vmcnt(0)
1145 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1146 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
1147 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1148 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
1149 ; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
1150 ; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
1151 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1152 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
1153 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
1154 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1155 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1158 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
1160 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1161 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1162 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1163 ; VI-NEXT: v_mov_b32_e32 v0, s2
1164 ; VI-NEXT: v_mov_b32_e32 v1, s3
1165 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1166 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1167 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
1168 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1169 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
1170 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1171 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
1172 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1173 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1174 ; VI-NEXT: flat_load_ubyte v3, v[6:7]
1175 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
1176 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1177 ; VI-NEXT: s_waitcnt vmcnt(3)
1178 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1179 ; VI-NEXT: s_waitcnt vmcnt(2)
1180 ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1181 ; VI-NEXT: s_waitcnt vmcnt(1)
1182 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1183 ; VI-NEXT: s_waitcnt vmcnt(0)
1184 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1185 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
1186 ; VI-NEXT: v_or_b32_e32 v3, v1, v0
1187 ; VI-NEXT: v_mov_b32_e32 v5, s1
1188 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1189 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1190 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1191 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
1192 ; VI-NEXT: v_mov_b32_e32 v4, s0
1193 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1195 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1196 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1197 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 ; byte alignment only
1198 %ext = zext <4 x i8> %load to <4 x i32> ; explicit widening before the conversion
1199 %cvt = uitofp <4 x i32> %ext to <4 x float>
1200 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Test: uitofp of byte 0 of a loaded i32 (and 255, no shift).
; The checks below expect the i32 load to be narrowed to a single byte load
; whose result feeds v_cvt_f32_ubyte0 directly on both targets.
1204 define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1205 ; SI-LABEL: extract_byte0_to_f32:
1207 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1208 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1209 ; SI-NEXT: v_mov_b32_e32 v1, 0
1210 ; SI-NEXT: s_mov_b32 s6, 0
1211 ; SI-NEXT: s_mov_b32 s7, 0xf000
1212 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1213 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1214 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1215 ; SI-NEXT: s_mov_b32 s6, -1
1216 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1217 ; SI-NEXT: s_waitcnt vmcnt(0)
1218 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1219 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1222 ; VI-LABEL: extract_byte0_to_f32:
1224 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1225 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1226 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1227 ; VI-NEXT: v_mov_b32_e32 v0, s2
1228 ; VI-NEXT: v_mov_b32_e32 v1, s3
1229 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1230 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1231 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1232 ; VI-NEXT: s_waitcnt vmcnt(0)
1233 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
1234 ; VI-NEXT: v_mov_b32_e32 v0, s0
1235 ; VI-NEXT: v_mov_b32_e32 v1, s1
1236 ; VI-NEXT: flat_store_dword v[0:1], v2
1238 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1239 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1240 %val = load i32, ptr addrspace(1) %gep
1241 %and = and i32 %val, 255 ; select bits 7:0
1242 %cvt = uitofp i32 %and to float
1243 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 1 of a loaded i32 (lshr 8, then and 255).
; The checks below expect SI (tahiti) to use v_bfe_u32 ..., 8, 8 followed by
; v_cvt_f32_ubyte0, and VI (tonga) to use one SDWA conversion with
; src0_sel:BYTE_1.
1247 define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1248 ; SI-LABEL: extract_byte1_to_f32:
1250 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1251 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1252 ; SI-NEXT: v_mov_b32_e32 v1, 0
1253 ; SI-NEXT: s_mov_b32 s6, 0
1254 ; SI-NEXT: s_mov_b32 s7, 0xf000
1255 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1256 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1257 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1258 ; SI-NEXT: s_mov_b32 s6, -1
1259 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1260 ; SI-NEXT: s_waitcnt vmcnt(0)
1261 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
1262 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1263 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1266 ; VI-LABEL: extract_byte1_to_f32:
1268 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1269 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1271 ; VI-NEXT: v_mov_b32_e32 v0, s2
1272 ; VI-NEXT: v_mov_b32_e32 v1, s3
1273 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1274 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1275 ; VI-NEXT: flat_load_dword v0, v[0:1]
1276 ; VI-NEXT: s_waitcnt vmcnt(0)
1277 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1278 ; VI-NEXT: v_mov_b32_e32 v0, s0
1279 ; VI-NEXT: v_mov_b32_e32 v1, s1
1280 ; VI-NEXT: flat_store_dword v[0:1], v2
1282 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1283 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1284 %val = load i32, ptr addrspace(1) %gep
1285 %srl = lshr i32 %val, 8 ; bring bits 15:8 down to bits 7:0
1286 %and = and i32 %srl, 255
1287 %cvt = uitofp i32 %and to float
1288 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 2 of a loaded i32 (lshr 16, then and 255).
; The checks below expect SI (tahiti) to use v_bfe_u32 ..., 16, 8 followed by
; v_cvt_f32_ubyte0, and VI (tonga) to use one SDWA conversion with
; src0_sel:BYTE_2.
1292 define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1293 ; SI-LABEL: extract_byte2_to_f32:
1295 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1296 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1297 ; SI-NEXT: v_mov_b32_e32 v1, 0
1298 ; SI-NEXT: s_mov_b32 s6, 0
1299 ; SI-NEXT: s_mov_b32 s7, 0xf000
1300 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1301 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1302 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1303 ; SI-NEXT: s_mov_b32 s6, -1
1304 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1305 ; SI-NEXT: s_waitcnt vmcnt(0)
1306 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
1307 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1308 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1311 ; VI-LABEL: extract_byte2_to_f32:
1313 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1314 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1315 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1316 ; VI-NEXT: v_mov_b32_e32 v0, s2
1317 ; VI-NEXT: v_mov_b32_e32 v1, s3
1318 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1319 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1320 ; VI-NEXT: flat_load_dword v0, v[0:1]
1321 ; VI-NEXT: s_waitcnt vmcnt(0)
1322 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1323 ; VI-NEXT: v_mov_b32_e32 v0, s0
1324 ; VI-NEXT: v_mov_b32_e32 v1, s1
1325 ; VI-NEXT: flat_store_dword v[0:1], v2
1327 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1328 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1329 %val = load i32, ptr addrspace(1) %gep
1330 %srl = lshr i32 %val, 16 ; bring bits 23:16 down to bits 7:0
1331 %and = and i32 %srl, 255
1332 %cvt = uitofp i32 %and to float
1333 store float %cvt, ptr addrspace(1) %out
; Test: uitofp of byte 3 of a loaded i32 (lshr 24, then and 255).
; The checks below expect both targets to use v_cvt_f32_ubyte3 on the loaded
; dword with no separate shift or mask instruction.
1337 define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1338 ; SI-LABEL: extract_byte3_to_f32:
1340 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1341 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1342 ; SI-NEXT: v_mov_b32_e32 v1, 0
1343 ; SI-NEXT: s_mov_b32 s6, 0
1344 ; SI-NEXT: s_mov_b32 s7, 0xf000
1345 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1346 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1347 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1348 ; SI-NEXT: s_mov_b32 s6, -1
1349 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1350 ; SI-NEXT: s_waitcnt vmcnt(0)
1351 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1352 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1355 ; VI-LABEL: extract_byte3_to_f32:
1357 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1358 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1359 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1360 ; VI-NEXT: v_mov_b32_e32 v0, s2
1361 ; VI-NEXT: v_mov_b32_e32 v1, s3
1362 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1363 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1364 ; VI-NEXT: flat_load_dword v0, v[0:1]
1365 ; VI-NEXT: s_waitcnt vmcnt(0)
1366 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0
1367 ; VI-NEXT: v_mov_b32_e32 v0, s0
1368 ; VI-NEXT: v_mov_b32_e32 v1, s1
1369 ; VI-NEXT: flat_store_dword v[0:1], v2
1371 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1372 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
1373 %val = load i32, ptr addrspace(1) %gep
1374 %srl = lshr i32 %val, 24 ; bring bits 31:24 down to bits 7:0
1375 %and = and i32 %srl, 255
1376 %cvt = uitofp i32 %and to float
1377 store float %cvt, ptr addrspace(1) %out
; Test: byte-0 conversion when the source value has a second, full-width use.
; %or feeds both the masked uitofp and a bitcast to float, and the two results
; are added together.
; The checks below expect the v_or_b32 with the full 0x80000001 constant to be
; kept, with the byte conversion (v_and + v_cvt_f32_ubyte0 on SI (tahiti),
; SDWA src0_sel:BYTE_0 on VI (tonga)) and the v_add_f32 both reading its
; result.
1381 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
1382 ; SI-LABEL: cvt_ubyte0_or_multiuse:
1383 ; SI: ; %bb.0: ; %bb
1384 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1385 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1386 ; SI-NEXT: v_mov_b32_e32 v1, 0
1387 ; SI-NEXT: s_mov_b32 s6, 0
1388 ; SI-NEXT: s_mov_b32 s7, 0xf000
1389 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1390 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
1391 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1392 ; SI-NEXT: s_mov_b32 s6, -1
1393 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1394 ; SI-NEXT: s_waitcnt vmcnt(0)
1395 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1396 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
1397 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1398 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
1399 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1402 ; VI-LABEL: cvt_ubyte0_or_multiuse:
1403 ; VI: ; %bb.0: ; %bb
1404 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1405 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1406 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1407 ; VI-NEXT: v_mov_b32_e32 v0, s0
1408 ; VI-NEXT: v_mov_b32_e32 v1, s1
1409 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1410 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1411 ; VI-NEXT: flat_load_dword v0, v[0:1]
1412 ; VI-NEXT: s_waitcnt vmcnt(0)
1413 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1414 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1415 ; VI-NEXT: v_add_f32_e32 v2, v0, v1
1416 ; VI-NEXT: v_mov_b32_e32 v0, s2
1417 ; VI-NEXT: v_mov_b32_e32 v1, s3
1418 ; VI-NEXT: flat_store_dword v[0:1], v2
1421 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1422 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
1423 %load = load i32, ptr addrspace(1) %gep
1424 %or = or i32 %load, -2147483647 ; -2147483647 == 0x80000001 (bit 31 and bit 0)
1425 %and = and i32 %or, 255 ; first use of %or: low byte only
1426 %uitofp = uitofp i32 %and to float
1427 %cast = bitcast i32 %or to float ; second use of %or: all 32 bits
1428 %add = fadd float %cast, %uitofp
1429 store float %add, ptr addrspace(1) %out
; Test: sitofp of an i64 whose value has been masked to its low byte.
; The checks below do not expect v_cvt_f32_ubyte0 here. Both targets emit the
; longer 64-bit sequence (v_ffbh_i32, 64-bit left shift, v_cvt_f32_i32,
; v_ldexp_f32) after the v_and_b32 0xff mask.
1433 define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
1434 ; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
1436 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1437 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1438 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1439 ; SI-NEXT: v_ffbh_i32_e32 v3, 0
1440 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
1441 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3
1442 ; SI-NEXT: v_mov_b32_e32 v1, 0
1443 ; SI-NEXT: v_min_u32_e32 v2, v3, v2
1444 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1445 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1446 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1447 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
1448 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1449 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1450 ; SI-NEXT: s_setpc_b64 s[30:31]
1452 ; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
1454 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1455 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1456 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
1457 ; VI-NEXT: v_ffbh_i32_e32 v3, 0
1458 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
1459 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3
1460 ; VI-NEXT: v_mov_b32_e32 v1, 0
1461 ; VI-NEXT: v_min_u32_e32 v2, v3, v2
1462 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1463 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1464 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1465 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
1466 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1467 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1468 ; VI-NEXT: s_setpc_b64 s[30:31]
1469 %masked = and i64 %arg0, 255 ; value is limited to 0..255 before the signed conversion
1470 %itofp = sitofp i64 %masked to float
; Test: uitofp of an i64 whose value has been masked to its low byte.
; The checks below do not expect v_cvt_f32_ubyte0 here. Both targets emit the
; longer 64-bit sequence (v_ffbh_u32, 64-bit left shift, v_cvt_f32_u32,
; v_ldexp_f32) around the v_and_b32 0xff mask.
1474 define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
1475 ; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
1477 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1478 ; SI-NEXT: v_ffbh_u32_e32 v2, 0
1479 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1480 ; SI-NEXT: v_mov_b32_e32 v1, 0
1481 ; SI-NEXT: v_min_u32_e32 v2, 32, v2
1482 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
1483 ; SI-NEXT: v_min_u32_e32 v0, 1, v0
1484 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1485 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
1486 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
1487 ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
1488 ; SI-NEXT: s_setpc_b64 s[30:31]
1490 ; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
1492 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1493 ; VI-NEXT: v_ffbh_u32_e32 v2, 0
1494 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1495 ; VI-NEXT: v_mov_b32_e32 v1, 0
1496 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1497 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
1498 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1499 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1500 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1501 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
1502 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
1503 ; VI-NEXT: s_setpc_b64 s[30:31]
1504 %masked = and i64 %arg0, 255 ; value is limited to 0..255 before the unsigned conversion
1505 %itofp = uitofp i64 %masked to float
; Test: sitofp of an i16 whose value has been masked to its low byte.
; The checks below expect v_cvt_f32_ubyte0 on both targets, but with the
; i16 sign extension still present: a v_bfe_i32 ..., 0, 16 on SI (tahiti) and
; an SDWA sext(v0) operand with src0_sel:WORD_0 on VI (tonga).
1509 define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
1510 ; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
1512 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1513 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1514 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
1515 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1516 ; SI-NEXT: s_setpc_b64 s[30:31]
1518 ; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
1520 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1522 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1523 ; VI-NEXT: s_setpc_b64 s[30:31]
1524 %masked = and i16 %arg0, 255 ; value is limited to 0..255 before the signed conversion
1525 %itofp = sitofp i16 %masked to float
; Test: uitofp of an i16 whose value has been masked to its low byte.
; The checks below expect v_cvt_f32_ubyte0 on both targets, preceded by a
; 0xff mask and a separate 0xffff mask: two v_and_b32 on SI (tahiti), and a
; v_mov_b32 of 0xffff combined through an SDWA v_and_b32 with
; src1_sel:BYTE_0 on VI (tonga).
1529 define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
1530 ; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
1532 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1533 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1534 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1535 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1536 ; SI-NEXT: s_setpc_b64 s[30:31]
1538 ; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
1540 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1541 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1542 ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1543 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1544 ; VI-NEXT: s_setpc_b64 s[30:31]
1545 %masked = and i16 %arg0, 255 ; value is limited to 0..255 before the unsigned conversion
1546 %itofp = uitofp i16 %masked to float