1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
5 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
6 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
; AMDGPU work-item id intrinsics. workitem.id.x is called by the kernel tests
; in this file to form a per-work-item offset into the input buffer.
8 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
9 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; Unsigned i32 -> float conversion of the low byte (arg0 & 255).
; Every checked target is expected to emit a single v_cvt_f32_ubyte0 with no
; separate masking instruction.
11 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
12 ; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
14 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
16 ; GCN-NEXT: s_setpc_b64 s[30:31]
18 ; GFX10-LABEL: v_uitofp_i32_to_f32_mask255:
20 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
22 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24 ; GFX9-LABEL: v_uitofp_i32_to_f32_mask255:
26 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
28 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30 ; GFX11-LABEL: v_uitofp_i32_to_f32_mask255:
32 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
34 ; GFX11-NEXT: s_setpc_b64 s[30:31]
35 %masked = and i32 %arg0, 255
36 %cvt = uitofp i32 %masked to float
; Signed i32 -> float conversion of (arg0 & 255). The mask clears the sign
; bit, and the checks expect the same single v_cvt_f32_ubyte0 as an unsigned
; conversion would use.
40 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
41 ; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
43 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
45 ; GCN-NEXT: s_setpc_b64 s[30:31]
47 ; GFX10-LABEL: v_sitofp_i32_to_f32_mask255:
49 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
51 ; GFX10-NEXT: s_setpc_b64 s[30:31]
53 ; GFX9-LABEL: v_sitofp_i32_to_f32_mask255:
55 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
57 ; GFX9-NEXT: s_setpc_b64 s[30:31]
59 ; GFX11-LABEL: v_sitofp_i32_to_f32_mask255:
61 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
63 ; GFX11-NEXT: s_setpc_b64 s[30:31]
64 %masked = and i32 %arg0, 255
65 %cvt = sitofp i32 %masked to float
; Shift by 7 (not a multiple of 8), then mask to 8 bits and convert to float.
; The checks expect the shift to remain as an explicit v_lshrrev_b32 by 7,
; followed by v_cvt_f32_ubyte0 on the shifted value.
69 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
70 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
72 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0
74 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
75 ; GCN-NEXT: s_setpc_b64 s[30:31]
77 ; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255:
79 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0
81 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
82 ; GFX10-NEXT: s_setpc_b64 s[30:31]
84 ; GFX9-LABEL: v_uitofp_to_f32_lshr7_mask255:
86 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 7, v0
88 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
89 ; GFX9-NEXT: s_setpc_b64 s[30:31]
91 ; GFX11-LABEL: v_uitofp_to_f32_lshr7_mask255:
93 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 7, v0
95 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
96 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
97 ; GFX11-NEXT: s_setpc_b64 s[30:31]
98 %lshr.7 = lshr i32 %arg0, 7
99 %masked = and i32 %lshr.7, 255
100 %cvt = uitofp i32 %masked to float
; Shift by 8, mask to 8 bits, convert to float. The checks expect the shift
; and mask to disappear into a single v_cvt_f32_ubyte1 on the original input.
104 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
105 ; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
107 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
109 ; GCN-NEXT: s_setpc_b64 s[30:31]
111 ; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255:
113 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
115 ; GFX10-NEXT: s_setpc_b64 s[30:31]
117 ; GFX9-LABEL: v_uitofp_to_f32_lshr8_mask255:
119 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
121 ; GFX9-NEXT: s_setpc_b64 s[30:31]
123 ; GFX11-LABEL: v_uitofp_to_f32_lshr8_mask255:
125 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
127 ; GFX11-NEXT: s_setpc_b64 s[30:31]
128 %lshr.8 = lshr i32 %arg0, 8
129 %masked = and i32 %lshr.8, 255
130 %cvt = uitofp i32 %masked to float
; Same shift-by-8 + mask + convert pattern, but the shifted value has a second
; use: it is stored to an undef global pointer. The checks expect the shift to
; be kept (into v1) for the store, while the conversion still reads the
; original v0 through v_cvt_f32_ubyte1.
134 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
135 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
137 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
139 ; SI-NEXT: s_mov_b32 s7, 0xf000
140 ; SI-NEXT: s_mov_b32 s6, -1
141 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
142 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
143 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
144 ; SI-NEXT: s_setpc_b64 s[30:31]
146 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
148 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
150 ; VI-NEXT: s_mov_b32 s7, 0xf000
151 ; VI-NEXT: s_mov_b32 s6, -1
152 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
153 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
154 ; VI-NEXT: s_waitcnt vmcnt(0)
155 ; VI-NEXT: s_setpc_b64 s[30:31]
157 ; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
159 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
161 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
162 ; GFX10-NEXT: global_store_dword v[0:1], v1, off
163 ; GFX10-NEXT: s_setpc_b64 s[30:31]
165 ; GFX9-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
167 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
169 ; GFX9-NEXT: global_store_dword v[0:1], v1, off
170 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
171 ; GFX9-NEXT: s_waitcnt vmcnt(0)
172 ; GFX9-NEXT: s_setpc_b64 s[30:31]
174 ; GFX11-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
176 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
178 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
179 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off
180 ; GFX11-NEXT: s_setpc_b64 s[30:31]
181 %lshr.8 = lshr i32 %arg0, 8
182 store i32 %lshr.8, ptr addrspace(1) undef
183 %masked = and i32 %lshr.8, 255
184 %cvt = uitofp i32 %masked to float
; Shift by 16, mask to 8 bits, convert to float. The checks expect a single
; v_cvt_f32_ubyte2 on the original input.
188 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
189 ; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
191 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
193 ; GCN-NEXT: s_setpc_b64 s[30:31]
195 ; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255:
197 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
199 ; GFX10-NEXT: s_setpc_b64 s[30:31]
201 ; GFX9-LABEL: v_uitofp_to_f32_lshr16_mask255:
203 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
205 ; GFX9-NEXT: s_setpc_b64 s[30:31]
207 ; GFX11-LABEL: v_uitofp_to_f32_lshr16_mask255:
209 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
211 ; GFX11-NEXT: s_setpc_b64 s[30:31]
212 %lshr.16 = lshr i32 %arg0, 16
213 %masked = and i32 %lshr.16, 255
214 %cvt = uitofp i32 %masked to float
; Shift by 24, mask to 8 bits, convert to float. The checks expect a single
; v_cvt_f32_ubyte3 on the original input.
; The shifted value is named %lshr.24 so that the name matches the shift amount.
218 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
219 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
221 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
223 ; GCN-NEXT: s_setpc_b64 s[30:31]
225 ; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255:
227 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
229 ; GFX10-NEXT: s_setpc_b64 s[30:31]
231 ; GFX9-LABEL: v_uitofp_to_f32_lshr24_mask255:
233 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
235 ; GFX9-NEXT: s_setpc_b64 s[30:31]
237 ; GFX11-LABEL: v_uitofp_to_f32_lshr24_mask255:
239 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
241 ; GFX11-NEXT: s_setpc_b64 s[30:31]
242 %lshr.24 = lshr i32 %arg0, 24
243 %masked = and i32 %lshr.24, 255
244 %cvt = uitofp i32 %masked to float
; Direct i8 -> float unsigned conversion of the argument. The checks expect a
; single v_cvt_f32_ubyte0 on every target.
248 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
249 ; GCN-LABEL: v_uitofp_i8_to_f32:
251 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
253 ; GCN-NEXT: s_setpc_b64 s[30:31]
255 ; GFX10-LABEL: v_uitofp_i8_to_f32:
257 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
259 ; GFX10-NEXT: s_setpc_b64 s[30:31]
261 ; GFX9-LABEL: v_uitofp_i8_to_f32:
263 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
267 ; GFX11-LABEL: v_uitofp_i8_to_f32:
269 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
271 ; GFX11-NEXT: s_setpc_b64 s[30:31]
272 %cvt = uitofp i8 %arg0 to float
; An i16 argument is bitcast to <2 x i8> and converted element-wise to float.
; The checks expect v_cvt_f32_ubyte0 and v_cvt_f32_ubyte1 reading the same
; input register, plus a v_mov that places element 0 in v0.
276 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
277 ; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
279 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
281 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
282 ; GCN-NEXT: v_mov_b32_e32 v0, v2
283 ; GCN-NEXT: s_setpc_b64 s[30:31]
285 ; GFX10-LABEL: v_uitofp_v2i8_to_v2f32:
287 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
289 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
290 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
291 ; GFX10-NEXT: s_setpc_b64 s[30:31]
293 ; GFX9-LABEL: v_uitofp_v2i8_to_v2f32:
295 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
297 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
298 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
299 ; GFX9-NEXT: s_setpc_b64 s[30:31]
301 ; GFX11-LABEL: v_uitofp_v2i8_to_v2f32:
303 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
305 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
306 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
307 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
308 ; GFX11-NEXT: s_setpc_b64 s[30:31]
309 %val = bitcast i16 %arg0 to <2 x i8>
310 %cvt = uitofp <2 x i8> %val to <2 x float>
; An i32 argument is truncated to i24, bitcast to <3 x i8> and converted
; element-wise to float. The checks expect v_cvt_f32_ubyte0/1/2 reading the
; same input register, plus a v_mov that places element 0 in v0.
314 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
315 ; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
317 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
319 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
320 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
321 ; GCN-NEXT: v_mov_b32_e32 v0, v3
322 ; GCN-NEXT: s_setpc_b64 s[30:31]
324 ; GFX10-LABEL: v_uitofp_v3i8_to_v3f32:
326 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
328 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
329 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
330 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
331 ; GFX10-NEXT: s_setpc_b64 s[30:31]
333 ; GFX9-LABEL: v_uitofp_v3i8_to_v3f32:
335 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
337 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
338 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
339 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
340 ; GFX9-NEXT: s_setpc_b64 s[30:31]
342 ; GFX11-LABEL: v_uitofp_v3i8_to_v3f32:
344 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
346 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
347 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
349 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
350 ; GFX11-NEXT: s_setpc_b64 s[30:31]
351 %trunc = trunc i32 %arg0 to i24
352 %val = bitcast i24 %trunc to <3 x i8>
353 %cvt = uitofp <3 x i8> %val to <3 x float>
; An i32 argument is bitcast to <4 x i8> and converted element-wise to float.
; The checks expect v_cvt_f32_ubyte0/1/2/3 reading the same input register,
; plus a v_mov that places element 0 in v0.
357 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
358 ; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
360 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
361 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
362 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
363 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
364 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
365 ; GCN-NEXT: v_mov_b32_e32 v0, v4
366 ; GCN-NEXT: s_setpc_b64 s[30:31]
368 ; GFX10-LABEL: v_uitofp_v4i8_to_v4f32:
370 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
372 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
373 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
374 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
375 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
376 ; GFX10-NEXT: s_setpc_b64 s[30:31]
378 ; GFX9-LABEL: v_uitofp_v4i8_to_v4f32:
380 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
382 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
383 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
384 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
385 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
386 ; GFX9-NEXT: s_setpc_b64 s[30:31]
388 ; GFX11-LABEL: v_uitofp_v4i8_to_v4f32:
390 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
392 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
393 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
394 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
395 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
396 ; GFX11-NEXT: v_mov_b32_e32 v0, v4
397 ; GFX11-NEXT: s_setpc_b64 s[30:31]
398 %val = bitcast i32 %arg0 to <4 x i8>
399 %cvt = uitofp <4 x i8> %val to <4 x float>
; Hand-written unpack of all four bytes of an i32: each byte is extracted with
; lshr + and 255, converted with uitofp, and inserted into a <4 x float>.
; The checks expect one v_cvt_f32_ubyteN per byte (N = 0..3), all reading the
; original input register, plus a v_mov that places element 0 in v0.
403 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
404 ; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
406 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
408 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
409 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
410 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
411 ; GCN-NEXT: v_mov_b32_e32 v0, v4
412 ; GCN-NEXT: s_setpc_b64 s[30:31]
414 ; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32:
416 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
418 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
419 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
420 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
421 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
422 ; GFX10-NEXT: s_setpc_b64 s[30:31]
424 ; GFX9-LABEL: v_uitofp_unpack_i32_to_v4f32:
426 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
428 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
429 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
430 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
431 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
432 ; GFX9-NEXT: s_setpc_b64 s[30:31]
434 ; GFX11-LABEL: v_uitofp_unpack_i32_to_v4f32:
436 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
438 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
439 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
440 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
441 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
442 ; GFX11-NEXT: v_mov_b32_e32 v0, v4
443 ; GFX11-NEXT: s_setpc_b64 s[30:31]
444 %mask.arg0 = and i32 %arg0, 255
445 %cvt0 = uitofp i32 %mask.arg0 to float
447 %lshr.8 = lshr i32 %arg0, 8
448 %mask.lshr.8 = and i32 %lshr.8, 255
449 %cvt1 = uitofp i32 %mask.lshr.8 to float
451 %lshr.16 = lshr i32 %arg0, 16
452 %mask.lshr.16 = and i32 %lshr.16, 255
453 %cvt2 = uitofp i32 %mask.lshr.16 to float
455 %lshr.24 = lshr i32 %arg0, 24
456 %mask.lshr.24 = and i32 %lshr.24, 255
457 %cvt3 = uitofp i32 %mask.lshr.24 to float
459 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
460 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
461 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
462 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
463 ret <4 x float> %ins.3
; Unsigned conversion of (arg0 & 255) to half. The checks expect
; v_cvt_f32_ubyte0 followed by v_cvt_f16_f32; the tahiti run additionally
; expects a trailing v_cvt_f32_f16.
466 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
467 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
469 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
471 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
472 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
473 ; SI-NEXT: s_setpc_b64 s[30:31]
475 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
477 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
479 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
480 ; VI-NEXT: s_setpc_b64 s[30:31]
482 ; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
484 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
486 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
487 ; GFX10-NEXT: s_setpc_b64 s[30:31]
489 ; GFX9-LABEL: v_uitofp_i32_to_f16_mask255:
491 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
493 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
494 ; GFX9-NEXT: s_setpc_b64 s[30:31]
496 ; GFX11-LABEL: v_uitofp_i32_to_f16_mask255:
498 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
500 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
501 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
502 ; GFX11-NEXT: s_setpc_b64 s[30:31]
503 %masked = and i32 %arg0, 255
504 %cvt = uitofp i32 %masked to half
; Signed conversion of (arg0 & 255) to half. The mask clears the sign bit, and
; the checks expect the unsigned byte path: v_cvt_f32_ubyte0 followed by
; v_cvt_f16_f32 (plus a trailing v_cvt_f32_f16 in the tahiti run).
508 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
509 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
511 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
513 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
514 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
515 ; SI-NEXT: s_setpc_b64 s[30:31]
517 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
519 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
521 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
522 ; VI-NEXT: s_setpc_b64 s[30:31]
524 ; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
526 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
527 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
528 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
529 ; GFX10-NEXT: s_setpc_b64 s[30:31]
531 ; GFX9-LABEL: v_sitofp_i32_to_f16_mask255:
533 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
535 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
536 ; GFX9-NEXT: s_setpc_b64 s[30:31]
538 ; GFX11-LABEL: v_sitofp_i32_to_f16_mask255:
540 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
541 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
542 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
543 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
544 ; GFX11-NEXT: s_setpc_b64 s[30:31]
545 %masked = and i32 %arg0, 255
546 %cvt = sitofp i32 %masked to half
; Shift by 8, mask to 8 bits, convert to half. The checks expect
; v_cvt_f32_ubyte1 followed by v_cvt_f16_f32 (plus a trailing v_cvt_f32_f16 in
; the tahiti run).
550 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
551 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
553 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
555 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
556 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
557 ; SI-NEXT: s_setpc_b64 s[30:31]
559 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
561 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
563 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
564 ; VI-NEXT: s_setpc_b64 s[30:31]
566 ; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
568 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
570 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
571 ; GFX10-NEXT: s_setpc_b64 s[30:31]
573 ; GFX9-LABEL: v_uitofp_to_f16_lshr8_mask255:
575 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
577 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
578 ; GFX9-NEXT: s_setpc_b64 s[30:31]
580 ; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255:
582 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
583 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
584 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
585 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
586 ; GFX11-NEXT: s_setpc_b64 s[30:31]
587 %lshr.8 = lshr i32 %arg0, 8
588 %masked = and i32 %lshr.8, 255
589 %cvt = uitofp i32 %masked to half
; Shift by 16, mask to 8 bits, convert to half. The checks expect
; v_cvt_f32_ubyte2 followed by v_cvt_f16_f32 (plus a trailing v_cvt_f32_f16 in
; the tahiti run).
593 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
594 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
596 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
598 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
599 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
600 ; SI-NEXT: s_setpc_b64 s[30:31]
602 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
604 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
606 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
607 ; VI-NEXT: s_setpc_b64 s[30:31]
609 ; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
611 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
613 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
614 ; GFX10-NEXT: s_setpc_b64 s[30:31]
616 ; GFX9-LABEL: v_uitofp_to_f16_lshr16_mask255:
618 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
620 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
621 ; GFX9-NEXT: s_setpc_b64 s[30:31]
623 ; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255:
625 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
627 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
628 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
629 ; GFX11-NEXT: s_setpc_b64 s[30:31]
630 %lshr.16 = lshr i32 %arg0, 16
631 %masked = and i32 %lshr.16, 255
632 %cvt = uitofp i32 %masked to half
; Shift by 24, mask to 8 bits, convert to half. The checks expect
; v_cvt_f32_ubyte3 followed by v_cvt_f16_f32 (plus a trailing v_cvt_f32_f16 in
; the tahiti run).
; The shifted value is named %lshr.24 so that the name matches the shift amount.
636 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
637 ; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
639 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
641 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
642 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
643 ; SI-NEXT: s_setpc_b64 s[30:31]
645 ; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
647 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
649 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
650 ; VI-NEXT: s_setpc_b64 s[30:31]
652 ; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
654 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
656 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
657 ; GFX10-NEXT: s_setpc_b64 s[30:31]
659 ; GFX9-LABEL: v_uitofp_to_f16_lshr24_mask255:
661 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
663 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
664 ; GFX9-NEXT: s_setpc_b64 s[30:31]
666 ; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255:
668 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
669 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
670 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
671 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
672 ; GFX11-NEXT: s_setpc_b64 s[30:31]
673 %lshr.24 = lshr i32 %arg0, 24
674 %masked = and i32 %lshr.24, 255
675 %cvt = uitofp i32 %masked to half
; Direct i8 -> half unsigned conversion. Expected output differs per target:
;  - tahiti: v_cvt_f32_ubyte0, v_cvt_f16_f32, v_cvt_f32_f16
;  - tonga, gfx908, gfx1010: one v_cvt_f16_u16_sdwa selecting BYTE_0 of the source
;  - gfx1100: v_and_b32 with 0xff followed by v_cvt_f16_u16
679 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
680 ; SI-LABEL: v_uitofp_i8_to_f16:
682 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
684 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
685 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
686 ; SI-NEXT: s_setpc_b64 s[30:31]
688 ; VI-LABEL: v_uitofp_i8_to_f16:
690 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691 ; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
692 ; VI-NEXT: s_setpc_b64 s[30:31]
694 ; GFX10-LABEL: v_uitofp_i8_to_f16:
696 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
698 ; GFX10-NEXT: s_setpc_b64 s[30:31]
700 ; GFX9-LABEL: v_uitofp_i8_to_f16:
702 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; GFX9-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
704 ; GFX9-NEXT: s_setpc_b64 s[30:31]
706 ; GFX11-LABEL: v_uitofp_i8_to_f16:
708 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
710 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
711 ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
712 ; GFX11-NEXT: s_setpc_b64 s[30:31]
713 %cvt = uitofp i8 %arg0 to half
; Unsigned conversion of (arg0 & 255) to double. No byte-select conversion
; appears in the checks: they expect an explicit v_and_b32 with 0xff followed
; by v_cvt_f64_u32.
717 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
718 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
720 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
722 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
723 ; GCN-NEXT: s_setpc_b64 s[30:31]
725 ; GFX10-LABEL: v_uitofp_i32_to_f64_mask255:
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
729 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
730 ; GFX10-NEXT: s_setpc_b64 s[30:31]
732 ; GFX9-LABEL: v_uitofp_i32_to_f64_mask255:
734 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
736 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
737 ; GFX9-NEXT: s_setpc_b64 s[30:31]
739 ; GFX11-LABEL: v_uitofp_i32_to_f64_mask255:
741 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
743 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
744 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
745 ; GFX11-NEXT: s_setpc_b64 s[30:31]
746 %masked = and i32 %arg0, 255
747 %cvt = uitofp i32 %masked to double
; Shift by 8, mask to 8 bits, convert to double. The checks expect the shift
; and mask to become one v_bfe_u32 (offset 8, width 8) followed by
; v_cvt_f64_u32.
751 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
752 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
754 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755 ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
756 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
757 ; GCN-NEXT: s_setpc_b64 s[30:31]
759 ; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255:
761 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762 ; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8
763 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
764 ; GFX10-NEXT: s_setpc_b64 s[30:31]
766 ; GFX9-LABEL: v_uitofp_to_f64_lshr8_mask255:
768 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
769 ; GFX9-NEXT: v_bfe_u32 v0, v0, 8, 8
770 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
771 ; GFX9-NEXT: s_setpc_b64 s[30:31]
773 ; GFX11-LABEL: v_uitofp_to_f64_lshr8_mask255:
775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; GFX11-NEXT: v_bfe_u32 v0, v0, 8, 8
777 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
778 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
779 ; GFX11-NEXT: s_setpc_b64 s[30:31]
780 %lshr.8 = lshr i32 %arg0, 8
781 %masked = and i32 %lshr.8, 255
782 %cvt = uitofp i32 %masked to double
; Shift by 16, mask to 8 bits, convert to double. The checks expect one
; v_bfe_u32 (offset 16, width 8) followed by v_cvt_f64_u32.
786 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
787 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
789 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
791 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
792 ; GCN-NEXT: s_setpc_b64 s[30:31]
794 ; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255:
796 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
797 ; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8
798 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
799 ; GFX10-NEXT: s_setpc_b64 s[30:31]
801 ; GFX9-LABEL: v_uitofp_to_f64_lshr16_mask255:
803 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
804 ; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8
805 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
806 ; GFX9-NEXT: s_setpc_b64 s[30:31]
808 ; GFX11-LABEL: v_uitofp_to_f64_lshr16_mask255:
810 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
812 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
813 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
814 ; GFX11-NEXT: s_setpc_b64 s[30:31]
815 %lshr.16 = lshr i32 %arg0, 16
816 %masked = and i32 %lshr.16, 255
817 %cvt = uitofp i32 %masked to double
; Shift by 24, mask to 8 bits, convert to double. A logical shift right by 24
; leaves only 8 significant bits, and the checks expect just v_lshrrev_b32 by
; 24 (no separate mask) followed by v_cvt_f64_u32.
; The shifted value is named %lshr.24 so that the name matches the shift amount.
821 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
822 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
824 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
826 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
827 ; GCN-NEXT: s_setpc_b64 s[30:31]
829 ; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255:
831 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
833 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
834 ; GFX10-NEXT: s_setpc_b64 s[30:31]
836 ; GFX9-LABEL: v_uitofp_to_f64_lshr24_mask255:
838 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
840 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
841 ; GFX9-NEXT: s_setpc_b64 s[30:31]
843 ; GFX11-LABEL: v_uitofp_to_f64_lshr24_mask255:
845 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
847 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
848 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
849 ; GFX11-NEXT: s_setpc_b64 s[30:31]
850 %lshr.24 = lshr i32 %arg0, 24
851 %masked = and i32 %lshr.24, 255
852 %cvt = uitofp i32 %masked to double
; Direct i8 -> double unsigned conversion. Every target ends with
; v_cvt_f64_u32; the expected zero-extension before it differs per target:
;  - tahiti: v_and_b32 with 0xff
;  - tonga, gfx1010: v_mov of 0xffff, then v_and_b32_sdwa selecting BYTE_0
;  - gfx908: s_mov of 0xffff, then v_and_b32_sdwa selecting BYTE_0
;  - gfx1100: two v_and_b32 instructions (0xff, then 0xffff)
856 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
857 ; SI-LABEL: v_uitofp_i8_to_f64:
859 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
861 ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
862 ; SI-NEXT: s_setpc_b64 s[30:31]
864 ; VI-LABEL: v_uitofp_i8_to_f64:
866 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
868 ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
869 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
870 ; VI-NEXT: s_setpc_b64 s[30:31]
872 ; GFX10-LABEL: v_uitofp_i8_to_f64:
874 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff
876 ; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
877 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
878 ; GFX10-NEXT: s_setpc_b64 s[30:31]
880 ; GFX9-LABEL: v_uitofp_i8_to_f64:
882 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
884 ; GFX9-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
885 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
886 ; GFX9-NEXT: s_setpc_b64 s[30:31]
888 ; GFX11-LABEL: v_uitofp_i8_to_f64:
890 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
892 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
893 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
894 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
895 ; GFX11-NEXT: s_setpc_b64 s[30:31]
896 %cvt = uitofp i8 %arg0 to double
; Kernel test: each work-item loads one i8 from %in at an offset equal to its
; workitem.id.x, converts it to float with uitofp, and stores the result to
; %out. On every target the checks expect an unsigned byte load, a single
; v_cvt_f32_ubyte0, and a 32-bit store.
900 define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
901 ; SI-LABEL: load_i8_to_f32:
903 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
904 ; SI-NEXT: s_mov_b32 s7, 0xf000
905 ; SI-NEXT: v_mov_b32_e32 v1, 0
906 ; SI-NEXT: s_mov_b32 s10, 0
907 ; SI-NEXT: s_mov_b32 s11, s7
908 ; SI-NEXT: s_waitcnt lgkmcnt(0)
909 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
910 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
911 ; SI-NEXT: s_mov_b32 s6, -1
912 ; SI-NEXT: s_mov_b32 s4, s0
913 ; SI-NEXT: s_mov_b32 s5, s1
914 ; SI-NEXT: s_waitcnt vmcnt(0)
915 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
916 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
919 ; VI-LABEL: load_i8_to_f32:
921 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
922 ; VI-NEXT: s_waitcnt lgkmcnt(0)
923 ; VI-NEXT: v_mov_b32_e32 v1, s3
924 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
925 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
926 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
927 ; VI-NEXT: s_mov_b32 s3, 0xf000
928 ; VI-NEXT: s_mov_b32 s2, -1
929 ; VI-NEXT: s_waitcnt vmcnt(0)
930 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
931 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
934 ; GFX10-LABEL: load_i8_to_f32:
936 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
937 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
938 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
939 ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7]
940 ; GFX10-NEXT: s_waitcnt vmcnt(0)
941 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
942 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
943 ; GFX10-NEXT: s_endpgm
945 ; GFX9-LABEL: load_i8_to_f32:
947 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
948 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7]
951 ; GFX9-NEXT: s_waitcnt vmcnt(0)
952 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
953 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
954 ; GFX9-NEXT: s_endpgm
956 ; GFX11-LABEL: load_i8_to_f32:
958 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
959 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
960 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
961 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
962 ; GFX11-NEXT: s_waitcnt vmcnt(0)
963 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
964 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
965 ; GFX11-NEXT: s_nop 0
966 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
967 ; GFX11-NEXT: s_endpgm
968 %tid = call i32 @llvm.amdgcn.workitem.id.x()
969 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
970 %load = load i8, ptr addrspace(1) %gep, align 1
971 %cvt = uitofp i8 %load to float
972 store float %cvt, ptr addrspace(1) %out, align 4
; Kernel test: each work item loads a <2 x i8> (align 2) from %in at index
; workitem.id.x, converts both elements with uitofp, and stores the
; <2 x float> to %out. The checks below expect one 16-bit load, with
; v_cvt_f32_ubyte0 and v_cvt_f32_ubyte1 both reading the loaded register.
976 define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
977 ; SI-LABEL: load_v2i8_to_v2f32:
979 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
980 ; SI-NEXT: s_mov_b32 s7, 0xf000
981 ; SI-NEXT: s_mov_b32 s10, 0
982 ; SI-NEXT: s_mov_b32 s11, s7
983 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
984 ; SI-NEXT: s_waitcnt lgkmcnt(0)
985 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
986 ; SI-NEXT: v_mov_b32_e32 v1, 0
987 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
988 ; SI-NEXT: s_mov_b32 s6, -1
989 ; SI-NEXT: s_mov_b32 s4, s0
990 ; SI-NEXT: s_mov_b32 s5, s1
991 ; SI-NEXT: s_waitcnt vmcnt(0)
992 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
993 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
994 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
997 ; VI-LABEL: load_v2i8_to_v2f32:
999 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1000 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1001 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1002 ; VI-NEXT: v_mov_b32_e32 v1, s3
1003 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1004 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1005 ; VI-NEXT: flat_load_ushort v0, v[0:1]
1006 ; VI-NEXT: s_mov_b32 s3, 0xf000
1007 ; VI-NEXT: s_mov_b32 s2, -1
1008 ; VI-NEXT: s_waitcnt vmcnt(0)
1009 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1010 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1011 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1014 ; GFX10-LABEL: load_v2i8_to_v2f32:
1016 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1017 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1018 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1019 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1020 ; GFX10-NEXT: global_load_ushort v0, v0, s[6:7]
1021 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1022 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1023 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1024 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1025 ; GFX10-NEXT: s_endpgm
1027 ; GFX9-LABEL: load_v2i8_to_v2f32:
1029 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1030 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1031 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1032 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1033 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7]
1034 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1035 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1036 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1037 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1038 ; GFX9-NEXT: s_endpgm
1040 ; GFX11-LABEL: load_v2i8_to_v2f32:
1042 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1043 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1044 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1045 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1046 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1047 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1048 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1049 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1050 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1051 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1052 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1053 ; GFX11-NEXT: s_nop 0
1054 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1055 ; GFX11-NEXT: s_endpgm
1056 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1057 %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
1058 %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
1059 %cvt = uitofp <2 x i8> %load to <2 x float>
1060 store <2 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel test: each work item loads a <3 x i8> (align 4) from %in at index
; workitem.id.x, converts the elements with uitofp, and stores the
; <3 x float> to %out. The checks below expect one dword load, with
; v_cvt_f32_ubyte0/1/2 reading the loaded register. On SI the result is
; written with a dwordx2 store plus a dword store at offset 8; the other
; targets use a single three-dword store.
1064 define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1065 ; SI-LABEL: load_v3i8_to_v3f32:
1067 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1068 ; SI-NEXT: s_mov_b32 s7, 0xf000
1069 ; SI-NEXT: s_mov_b32 s10, 0
1070 ; SI-NEXT: s_mov_b32 s11, s7
1071 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1072 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1073 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1074 ; SI-NEXT: v_mov_b32_e32 v1, 0
1075 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1076 ; SI-NEXT: s_mov_b32 s6, -1
1077 ; SI-NEXT: s_mov_b32 s4, s0
1078 ; SI-NEXT: s_mov_b32 s5, s1
1079 ; SI-NEXT: s_waitcnt vmcnt(0)
1080 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2
1081 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1082 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
1083 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
1084 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1087 ; VI-LABEL: load_v3i8_to_v3f32:
1089 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1090 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1091 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1092 ; VI-NEXT: v_mov_b32_e32 v1, s3
1093 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1094 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1095 ; VI-NEXT: flat_load_dword v0, v[0:1]
1096 ; VI-NEXT: s_mov_b32 s3, 0xf000
1097 ; VI-NEXT: s_mov_b32 s2, -1
1098 ; VI-NEXT: s_waitcnt vmcnt(0)
1099 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1100 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1101 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1102 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
1105 ; GFX10-LABEL: load_v3i8_to_v3f32:
1107 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1108 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1109 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1110 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1111 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
1112 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1113 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1114 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1115 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1116 ; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
1117 ; GFX10-NEXT: s_endpgm
1119 ; GFX9-LABEL: load_v3i8_to_v3f32:
1121 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1122 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1123 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1124 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
1126 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1127 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1128 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1129 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1130 ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
1131 ; GFX9-NEXT: s_endpgm
1133 ; GFX11-LABEL: load_v3i8_to_v3f32:
1135 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1136 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1137 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1138 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1139 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1140 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1141 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1142 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1143 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1144 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1145 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
1146 ; GFX11-NEXT: s_nop 0
1147 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1148 ; GFX11-NEXT: s_endpgm
1149 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1150 %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
1151 %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
1152 %cvt = uitofp <3 x i8> %load to <3 x float>
1153 store <3 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel test: each work item loads a <4 x i8> (align 4) from %in at index
; workitem.id.x, converts the elements with uitofp, and stores the
; <4 x float> to %out. The checks below expect one dword load, with
; v_cvt_f32_ubyte0/1/2/3 reading the loaded register, followed by a single
; store of all four floats.
1157 define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1158 ; SI-LABEL: load_v4i8_to_v4f32:
1160 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1161 ; SI-NEXT: s_mov_b32 s7, 0xf000
1162 ; SI-NEXT: s_mov_b32 s10, 0
1163 ; SI-NEXT: s_mov_b32 s11, s7
1164 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1165 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1166 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1167 ; SI-NEXT: v_mov_b32_e32 v1, 0
1168 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1169 ; SI-NEXT: s_mov_b32 s6, -1
1170 ; SI-NEXT: s_mov_b32 s4, s0
1171 ; SI-NEXT: s_mov_b32 s5, s1
1172 ; SI-NEXT: s_waitcnt vmcnt(0)
1173 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1174 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1175 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1176 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1177 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1180 ; VI-LABEL: load_v4i8_to_v4f32:
1182 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1183 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1184 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1185 ; VI-NEXT: v_mov_b32_e32 v1, s3
1186 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1187 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1188 ; VI-NEXT: flat_load_dword v0, v[0:1]
1189 ; VI-NEXT: s_mov_b32 s3, 0xf000
1190 ; VI-NEXT: s_mov_b32 s2, -1
1191 ; VI-NEXT: s_waitcnt vmcnt(0)
1192 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1193 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1194 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1195 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1196 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1199 ; GFX10-LABEL: load_v4i8_to_v4f32:
1201 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1202 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1203 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1204 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1205 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
1206 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1208 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1209 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1210 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1211 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1212 ; GFX10-NEXT: s_endpgm
1214 ; GFX9-LABEL: load_v4i8_to_v4f32:
1216 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1217 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1218 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1219 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1220 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
1221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1222 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1223 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1224 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1225 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1226 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1227 ; GFX9-NEXT: s_endpgm
1229 ; GFX11-LABEL: load_v4i8_to_v4f32:
1231 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1232 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1233 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
1234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1235 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1236 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1238 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1240 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1241 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1242 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1243 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1244 ; GFX11-NEXT: s_nop 0
1245 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1246 ; GFX11-NEXT: s_endpgm
1247 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1248 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1249 %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
1250 %cvt = uitofp <4 x i8> %load to <4 x float>
1251 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1255 ; This should not be adding instructions to shift into the correct
1256 ; position in the word for the component.
1258 ; FIXME: Packing bytes
; Kernel test: each work item loads a <4 x i8> that is only 1-byte aligned
; (align 1) from %in at index workitem.id.x, converts the elements with
; uitofp, and stores the <4 x float> to %out. The checks below expect four
; separate byte loads, each converted with v_cvt_f32_ubyte0, followed by a
; single store of all four floats.
1259 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1260 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
1262 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1263 ; SI-NEXT: s_mov_b32 s7, 0xf000
1264 ; SI-NEXT: s_mov_b32 s10, 0
1265 ; SI-NEXT: s_mov_b32 s11, s7
1266 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1267 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1268 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1269 ; SI-NEXT: v_mov_b32_e32 v1, 0
1270 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
1271 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
1272 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
1273 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1274 ; SI-NEXT: s_mov_b32 s6, -1
1275 ; SI-NEXT: s_mov_b32 s4, s0
1276 ; SI-NEXT: s_mov_b32 s5, s1
1277 ; SI-NEXT: s_waitcnt vmcnt(3)
1278 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2
1279 ; SI-NEXT: s_waitcnt vmcnt(2)
1280 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1281 ; SI-NEXT: s_waitcnt vmcnt(1)
1282 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1283 ; SI-NEXT: s_waitcnt vmcnt(0)
1284 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1285 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1288 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
1290 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1291 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1292 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1293 ; VI-NEXT: v_mov_b32_e32 v1, s3
1294 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1295 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1296 ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1297 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1298 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
1299 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1300 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1301 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1302 ; VI-NEXT: flat_load_ubyte v4, v[0:1]
1303 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
1304 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1305 ; VI-NEXT: flat_load_ubyte v1, v[0:1]
1306 ; VI-NEXT: s_mov_b32 s3, 0xf000
1307 ; VI-NEXT: s_mov_b32 s2, -1
1308 ; VI-NEXT: s_waitcnt vmcnt(3)
1309 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1310 ; VI-NEXT: s_waitcnt vmcnt(2)
1311 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
1312 ; VI-NEXT: s_waitcnt vmcnt(1)
1313 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1314 ; VI-NEXT: s_waitcnt vmcnt(0)
1315 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1316 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1319 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
1321 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1322 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1323 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
1324 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1325 ; GFX10-NEXT: s_clause 0x3
1326 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3
1327 ; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2
1328 ; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1
1329 ; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7]
1330 ; GFX10-NEXT: s_waitcnt vmcnt(3)
1331 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1332 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1333 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1334 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1335 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1336 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1337 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
1338 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5]
1339 ; GFX10-NEXT: s_endpgm
1341 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
1343 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1344 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1345 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
1346 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1347 ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3
1348 ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2
1349 ; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1
1350 ; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7]
1351 ; GFX9-NEXT: s_waitcnt vmcnt(3)
1352 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1353 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1354 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1355 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1356 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1357 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1358 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
1359 ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5]
1360 ; GFX9-NEXT: s_endpgm
1362 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
1364 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1365 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1366 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1367 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1368 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1369 ; GFX11-NEXT: s_clause 0x3
1370 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
1371 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
1372 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1
1373 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1374 ; GFX11-NEXT: s_waitcnt vmcnt(3)
1375 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1376 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1377 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1378 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1379 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1380 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1382 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1]
1383 ; GFX11-NEXT: s_nop 0
1384 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1385 ; GFX11-NEXT: s_endpgm
1386 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1387 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1388 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
1389 %cvt = uitofp <4 x i8> %load to <4 x float>
1390 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1394 ; The other use of shuffle0_0 makes it profitable to lower into v_perm
; Kernel test: two <4 x i8> vectors are loaded with align 1 from %in and %in1
; (both indexed by workitem.id.x) and combined by a shufflevector with mask
; <3, 2, 6, 2>. The shuffled vector has two uses: it is converted with uitofp
; and stored as <4 x float> to %out, and it is also stored unconverted to
; %out1. The VI, GFX9, GFX10 and GFX11 checks contain a v_perm_b32 using the
; constant 0x4000405 to build the packed i8 value; the SI checks build it
; with shifts, ORs and v_alignbit_b32 instead.
1396 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
1397 ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1399 ; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
1400 ; SI-NEXT: s_mov_b32 s11, 0xf000
1401 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1402 ; SI-NEXT: v_mov_b32_e32 v1, 0
1403 ; SI-NEXT: s_mov_b32 s14, 0
1404 ; SI-NEXT: s_mov_b32 s15, s11
1405 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1406 ; SI-NEXT: s_mov_b64 s[12:13], s[4:5]
1407 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
1408 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
1409 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1410 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
1411 ; SI-NEXT: s_mov_b32 s10, -1
1412 ; SI-NEXT: s_mov_b32 s8, s2
1413 ; SI-NEXT: s_mov_b32 s9, s3
1414 ; SI-NEXT: s_mov_b32 s2, s10
1415 ; SI-NEXT: s_mov_b32 s3, s11
1416 ; SI-NEXT: s_waitcnt vmcnt(2)
1417 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
1418 ; SI-NEXT: s_waitcnt vmcnt(1)
1419 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
1420 ; SI-NEXT: v_or_b32_e32 v5, v5, v4
1421 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1422 ; SI-NEXT: s_waitcnt vmcnt(0)
1423 ; SI-NEXT: v_or_b32_e32 v6, v3, v6
1424 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
1425 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1426 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
1427 ; SI-NEXT: v_mov_b32_e32 v3, v1
1428 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1429 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
1430 ; SI-NEXT: v_or_b32_e32 v4, v4, v6
1431 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1432 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
1435 ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1437 ; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
1438 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1439 ; VI-NEXT: s_mov_b32 s8, 0x4000405
1440 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1441 ; VI-NEXT: v_mov_b32_e32 v1, s5
1442 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
1443 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1444 ; VI-NEXT: v_mov_b32_e32 v1, s7
1445 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0
1446 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1447 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2
1448 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1449 ; VI-NEXT: flat_load_ubyte v6, v[0:1]
1450 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2
1451 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1452 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4
1453 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
1454 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4
1455 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1456 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1457 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1458 ; VI-NEXT: flat_load_ubyte v4, v[0:1]
1459 ; VI-NEXT: s_mov_b32 s7, 0xf000
1460 ; VI-NEXT: s_mov_b32 s6, -1
1461 ; VI-NEXT: s_mov_b32 s4, s2
1462 ; VI-NEXT: s_mov_b32 s5, s3
1463 ; VI-NEXT: s_mov_b32 s2, s6
1464 ; VI-NEXT: s_mov_b32 s3, s7
1465 ; VI-NEXT: s_waitcnt vmcnt(3)
1466 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
1467 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
1468 ; VI-NEXT: s_waitcnt vmcnt(2)
1469 ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
1470 ; VI-NEXT: s_waitcnt vmcnt(1)
1471 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
1472 ; VI-NEXT: s_waitcnt vmcnt(0)
1473 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1474 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
1475 ; VI-NEXT: v_or_b32_e32 v5, v7, v3
1476 ; VI-NEXT: v_mov_b32_e32 v3, v1
1477 ; VI-NEXT: v_perm_b32 v4, v4, v5, s8
1478 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1479 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
1482 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1484 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
1485 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1486 ; GFX10-NEXT: v_mov_b32_e32 v7, 0
1487 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1488 ; GFX10-NEXT: s_clause 0x3
1489 ; GFX10-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2
1490 ; GFX10-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3
1491 ; GFX10-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3
1492 ; GFX10-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2
1493 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1494 ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1
1495 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1496 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1497 ; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4
1498 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1499 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
1500 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1501 ; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405
1502 ; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[4:5]
1503 ; GFX10-NEXT: global_store_dword v7, v4, s[6:7]
1504 ; GFX10-NEXT: s_endpgm
1506 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1508 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
1509 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1510 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
1511 ; GFX9-NEXT: s_mov_b32 s0, 0x4000405
1512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1513 ; GFX9-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2
1514 ; GFX9-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3
1515 ; GFX9-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3
1516 ; GFX9-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2
1517 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1518 ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1
1519 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1520 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1521 ; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4
1522 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1523 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
1524 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1525 ; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0
1526 ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
1527 ; GFX9-NEXT: global_store_dword v5, v4, s[6:7]
1528 ; GFX9-NEXT: s_endpgm
1530 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1532 ; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
1533 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1534 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
1535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1536 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1537 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1538 ; GFX11-NEXT: s_clause 0x3
1539 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2
1540 ; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3
1541 ; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3
1542 ; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2
1543 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1544 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1
1545 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1546 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1547 ; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0
1548 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
1549 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
1550 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1551 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
1552 ; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405
1553 ; GFX11-NEXT: s_clause 0x1
1554 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
1555 ; GFX11-NEXT: global_store_b32 v6, v4, s[2:3]
1556 ; GFX11-NEXT: s_nop 0
1557 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1558 ; GFX11-NEXT: s_endpgm
1559 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1560 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1561 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1562 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
1563 %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
; Mask indices 0-3 select from %load and 4-7 from %load1, so the result is
; <%load[3], %load[2], %load1[2], %load[2]>.
1564 %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
1565 %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
1566 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1567 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
1571 ; FIXME: Need to handle non-uniform case for function below (load without gep).
1572 ; Instructions are still emitted to repack the bytes for the add use.
1573 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
1574 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
1576 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
1577 ; SI-NEXT: s_mov_b32 s7, 0xf000
1578 ; SI-NEXT: s_mov_b32 s10, 0
1579 ; SI-NEXT: s_mov_b32 s11, s7
1580 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1581 ; SI-NEXT: v_mov_b32_e32 v1, 0
1582 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1583 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
1584 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1585 ; SI-NEXT: s_mov_b32 s6, -1
1586 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1587 ; SI-NEXT: s_mov_b32 s4, s2
1588 ; SI-NEXT: s_mov_b32 s5, s3
1589 ; SI-NEXT: s_mov_b32 s2, s6
1590 ; SI-NEXT: s_mov_b32 s3, s7
1591 ; SI-NEXT: s_waitcnt vmcnt(0)
1592 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
1593 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
1594 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1595 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1596 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1597 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1598 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
1599 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1600 ; SI-NEXT: s_waitcnt expcnt(0)
1601 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
1602 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
1603 ; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5
1604 ; SI-NEXT: v_or_b32_e32 v0, v6, v0
1605 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
1606 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
1607 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
1608 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1609 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1610 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1611 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
1612 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1615 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
1617 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1618 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1619 ; VI-NEXT: s_mov_b32 s7, 0xf000
1620 ; VI-NEXT: s_mov_b32 s6, -1
1621 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00
1622 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1623 ; VI-NEXT: v_mov_b32_e32 v1, s1
1624 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1625 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1626 ; VI-NEXT: flat_load_dword v4, v[0:1]
1627 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1628 ; VI-NEXT: v_mov_b32_e32 v6, 9
1629 ; VI-NEXT: v_mov_b32_e32 v7, 0x900
1630 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1631 ; VI-NEXT: s_mov_b32 s4, s2
1632 ; VI-NEXT: s_mov_b32 s5, s3
1633 ; VI-NEXT: s_mov_b32 s2, s6
1634 ; VI-NEXT: s_mov_b32 s3, s7
1635 ; VI-NEXT: s_waitcnt vmcnt(0)
1636 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1637 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1638 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1639 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1640 ; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v4
1641 ; VI-NEXT: v_add_u16_e32 v9, 9, v4
1642 ; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1643 ; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1644 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1646 ; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1647 ; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1648 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
1649 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1650 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1651 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1654 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
1656 ; GFX10-NEXT: s_clause 0x1
1657 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1658 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1659 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1660 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1661 ; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
1662 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1663 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1664 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
1665 ; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
1666 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1
1667 ; GFX10-NEXT: v_add_nc_u16 v1, v1, 9
1668 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1669 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1670 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1671 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1672 ; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900
1673 ; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900
1674 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1675 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
1676 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1677 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1678 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1679 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1680 ; GFX10-NEXT: global_store_dword v4, v5, s[6:7]
1681 ; GFX10-NEXT: s_endpgm
1683 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
1685 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1686 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1687 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1688 ; GFX9-NEXT: v_mov_b32_e32 v6, 9
1689 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
1690 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1691 ; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
1692 ; GFX9-NEXT: s_movk_i32 s0, 0xff00
1693 ; GFX9-NEXT: s_movk_i32 s1, 0x900
1694 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1695 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1696 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1697 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1698 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1699 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
1700 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4
1701 ; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1702 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1703 ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
1704 ; GFX9-NEXT: s_nop 0
1705 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1706 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1707 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0
1708 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1709 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
1710 ; GFX9-NEXT: global_store_dword v5, v0, s[6:7]
1711 ; GFX9-NEXT: s_endpgm
1713 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
1715 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
1716 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1717 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1718 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1719 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1720 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1721 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1722 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1723 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1724 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9
1725 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0
1726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1727 ; GFX11-NEXT: v_add_nc_u16 v3, v1, 9
1728 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
1729 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1731 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
1732 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v2
1733 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
1734 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1735 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1736 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
1737 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1738 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1739 ; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
1740 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
1741 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1742 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1743 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
1744 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1745 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1746 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
1747 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1748 ; GFX11-NEXT: s_clause 0x1
1749 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1750 ; GFX11-NEXT: global_store_b32 v4, v5, s[2:3]
1751 ; GFX11-NEXT: s_nop 0
1752 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1753 ; GFX11-NEXT: s_endpgm
1754 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
1755 %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
1756 %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
1757 %cvt = uitofp <4 x i8> %load to <4 x float>
1758 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1759 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
1760 store <4 x i8> %add, ptr addrspace(1) %out2, align 4
1764 ; Make sure this doesn't crash.
; Kernel under test: each workitem loads a byte-aligned (align 1) <7 x i8>
; from %in, converts it with uitofp to <7 x float> and stores it to %out.
; The checks below expect:
;   - SI and VI: seven single-byte loads, each converted with v_cvt_f32_ubyte0.
;   - GFX9, GFX10 and GFX11: five byte loads plus one 16-bit load at offset 4
;     whose two bytes are converted with v_cvt_f32_ubyte0/ubyte1.
;   - The 7-float result is written in pieces: dword + dwordx2 + dwordx4 on
;     SI, a 3-dword store plus a 4-dword store on the other targets.
; The assertions are autogenerated; regenerate them rather than editing by hand.
1765 define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1766 ; SI-LABEL: load_v7i8_to_v7f32:
1768 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1769 ; SI-NEXT: s_mov_b32 s7, 0xf000
1770 ; SI-NEXT: s_mov_b32 s10, 0
1771 ; SI-NEXT: s_mov_b32 s11, s7
1772 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1773 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1774 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1775 ; SI-NEXT: v_mov_b32_e32 v1, 0
1776 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
1777 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
1778 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
1779 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[8:11], 0 addr64
1780 ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[8:11], 0 addr64 offset:5
1781 ; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[8:11], 0 addr64 offset:4
1782 ; SI-NEXT: buffer_load_ubyte v9, v[0:1], s[8:11], 0 addr64 offset:6
1783 ; SI-NEXT: s_mov_b32 s6, -1
1784 ; SI-NEXT: s_mov_b32 s4, s0
1785 ; SI-NEXT: s_mov_b32 s5, s1
1786 ; SI-NEXT: s_waitcnt vmcnt(6)
1787 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2
1788 ; SI-NEXT: s_waitcnt vmcnt(5)
1789 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1790 ; SI-NEXT: s_waitcnt vmcnt(4)
1791 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1792 ; SI-NEXT: s_waitcnt vmcnt(3)
1793 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
1794 ; SI-NEXT: s_waitcnt vmcnt(2)
1795 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
1796 ; SI-NEXT: s_waitcnt vmcnt(1)
1797 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
1798 ; SI-NEXT: s_waitcnt vmcnt(0)
1799 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9
1800 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24
1801 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
1802 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1805 ; VI-LABEL: load_v7i8_to_v7f32:
1807 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1808 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1809 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1810 ; VI-NEXT: v_mov_b32_e32 v1, s3
1811 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1812 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1813 ; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
1814 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1815 ; VI-NEXT: flat_load_ubyte v10, v[2:3]
1816 ; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
1817 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1818 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0
1819 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1820 ; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
1821 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1822 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0
1823 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1824 ; VI-NEXT: flat_load_ubyte v6, v[6:7]
1825 ; VI-NEXT: flat_load_ubyte v7, v[8:9]
1826 ; VI-NEXT: flat_load_ubyte v8, v[2:3]
1827 ; VI-NEXT: flat_load_ubyte v2, v[0:1]
1828 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
1829 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
1830 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1831 ; VI-NEXT: flat_load_ubyte v9, v[0:1]
1832 ; VI-NEXT: s_mov_b32 s3, 0xf000
1833 ; VI-NEXT: s_mov_b32 s2, -1
1834 ; VI-NEXT: s_waitcnt vmcnt(6)
1835 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10
1836 ; VI-NEXT: s_waitcnt vmcnt(4)
1837 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7
1838 ; VI-NEXT: s_waitcnt vmcnt(2)
1839 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1840 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6
1841 ; VI-NEXT: s_waitcnt vmcnt(1)
1842 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1843 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
1844 ; VI-NEXT: s_waitcnt vmcnt(0)
1845 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
1846 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
1847 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1850 ; GFX10-LABEL: load_v7i8_to_v7f32:
1852 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1853 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1854 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
1855 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1856 ; GFX10-NEXT: s_clause 0x5
1857 ; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6
1858 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3
1859 ; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2
1860 ; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1
1861 ; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4
1862 ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7]
1863 ; GFX10-NEXT: s_waitcnt vmcnt(5)
1864 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
1865 ; GFX10-NEXT: s_waitcnt vmcnt(4)
1866 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1867 ; GFX10-NEXT: s_waitcnt vmcnt(3)
1868 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1869 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1870 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1871 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1872 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
1873 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
1874 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1875 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1876 ; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
1877 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
1878 ; GFX10-NEXT: s_endpgm
1880 ; GFX9-LABEL: load_v7i8_to_v7f32:
1882 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1883 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1884 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1885 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1886 ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6
1887 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4
1888 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3
1889 ; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2
1890 ; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1
1891 ; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7]
1892 ; GFX9-NEXT: s_waitcnt vmcnt(5)
1893 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
1894 ; GFX9-NEXT: s_waitcnt vmcnt(4)
1895 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
1896 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
1897 ; GFX9-NEXT: s_waitcnt vmcnt(3)
1898 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
1899 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1900 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
1901 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1902 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
1903 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1904 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
1905 ; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5]
1906 ; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
1907 ; GFX9-NEXT: s_endpgm
1909 ; GFX11-LABEL: load_v7i8_to_v7f32:
1911 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1912 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1913 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
1914 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1915 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1916 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1917 ; GFX11-NEXT: s_clause 0x5
1918 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
1919 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
1920 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
1921 ; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
1922 ; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
1923 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1924 ; GFX11-NEXT: s_waitcnt vmcnt(5)
1925 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
1926 ; GFX11-NEXT: s_waitcnt vmcnt(4)
1927 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1928 ; GFX11-NEXT: s_waitcnt vmcnt(3)
1929 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1930 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1931 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1932 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1933 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
1934 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
1935 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1936 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1937 ; GFX11-NEXT: s_clause 0x1
1938 ; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16
1939 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
1940 ; GFX11-NEXT: s_nop 0
1941 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1942 ; GFX11-NEXT: s_endpgm
; IR under test: index %in by the workitem id, load <7 x i8> (align 1),
; uitofp to <7 x float>, store to %out (align 16).
1943 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1944 %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
1945 %load = load <7 x i8>, ptr addrspace(1) %gep, align 1
1946 %cvt = uitofp <7 x i8> %load to <7 x float>
1947 store <7 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel under test: each workitem loads an 8-byte-aligned <8 x i8> from %in,
; converts it with uitofp to <8 x float> and stores it to %out.
; The checks below expect, on every target, a single 64-bit load
; (dwordx2 / b64), eight v_cvt_f32_ubyte{0,1,2,3} conversions (four per loaded
; dword) and two 128-bit stores (offset 16 and offset 0).
; The assertions are autogenerated; regenerate them rather than editing by hand.
1951 define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1952 ; SI-LABEL: load_v8i8_to_v8f32:
1954 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1955 ; SI-NEXT: s_mov_b32 s7, 0xf000
1956 ; SI-NEXT: s_mov_b32 s10, 0
1957 ; SI-NEXT: s_mov_b32 s11, s7
1958 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1959 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1960 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1961 ; SI-NEXT: v_mov_b32_e32 v1, 0
1962 ; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[8:11], 0 addr64
1963 ; SI-NEXT: s_mov_b32 s6, -1
1964 ; SI-NEXT: s_mov_b32 s4, s0
1965 ; SI-NEXT: s_mov_b32 s5, s1
1966 ; SI-NEXT: s_waitcnt vmcnt(0)
1967 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
1968 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
1969 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
1970 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1971 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
1972 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
1973 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
1974 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
1975 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1976 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1979 ; VI-LABEL: load_v8i8_to_v8f32:
1981 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1982 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1983 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1984 ; VI-NEXT: v_mov_b32_e32 v1, s3
1985 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1986 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1987 ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
1988 ; VI-NEXT: s_mov_b32 s3, 0xf000
1989 ; VI-NEXT: s_mov_b32 s2, -1
1990 ; VI-NEXT: s_waitcnt vmcnt(0)
1991 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
1992 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
1993 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
1994 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1995 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
1996 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
1997 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
1998 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
1999 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2000 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2003 ; GFX10-LABEL: load_v8i8_to_v8f32:
2005 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2006 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2007 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
2008 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2009 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7]
2010 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2011 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9
2012 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9
2013 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9
2014 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
2015 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8
2016 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8
2017 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8
2018 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8
2019 ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
2020 ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5]
2021 ; GFX10-NEXT: s_endpgm
2023 ; GFX9-LABEL: load_v8i8_to_v8f32:
2025 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2026 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2027 ; GFX9-NEXT: v_mov_b32_e32 v9, 0
2028 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2029 ; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7]
2030 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2031 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
2032 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
2033 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
2034 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
2035 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
2036 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
2037 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
2038 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
2039 ; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
2040 ; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5]
2041 ; GFX9-NEXT: s_endpgm
2043 ; GFX11-LABEL: load_v8i8_to_v8f32:
2045 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2046 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2047 ; GFX11-NEXT: v_mov_b32_e32 v10, 0
2048 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2049 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2050 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2051 ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3]
2052 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2053 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v9
2054 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v9
2055 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v9
2056 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
2057 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v8
2058 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v8
2059 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v8
2060 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v8
2061 ; GFX11-NEXT: s_clause 0x1
2062 ; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16
2063 ; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1]
2064 ; GFX11-NEXT: s_nop 0
2065 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2066 ; GFX11-NEXT: s_endpgm
; IR under test: index %in by the workitem id, load <8 x i8> (align 8),
; uitofp to <8 x float>, store to %out (align 16).
2067 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2068 %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
2069 %load = load <8 x i8>, ptr addrspace(1) %gep, align 8
2070 %cvt = uitofp <8 x i8> %load to <8 x float>
2071 store <8 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel under test: load an i32, add 2, mask the sum with 255 and convert the
; masked value with uitofp.
; The checks below expect a 32-bit add followed directly by
; v_cvt_f32_ubyte0 on every target; no AND instruction is expected after the
; load, i.e. the `and ..., 255` is absorbed by the byte-0 conversion.
; The assertions are autogenerated; regenerate them rather than editing by hand.
2075 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2076 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
2078 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2079 ; SI-NEXT: s_mov_b32 s7, 0xf000
2080 ; SI-NEXT: s_mov_b32 s10, 0
2081 ; SI-NEXT: s_mov_b32 s11, s7
2082 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2083 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2084 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2085 ; SI-NEXT: v_mov_b32_e32 v1, 0
2086 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2087 ; SI-NEXT: s_mov_b32 s6, -1
2088 ; SI-NEXT: s_mov_b32 s4, s0
2089 ; SI-NEXT: s_mov_b32 s5, s1
2090 ; SI-NEXT: s_waitcnt vmcnt(0)
2091 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
2092 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2093 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2096 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
2098 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2099 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2101 ; VI-NEXT: v_mov_b32_e32 v1, s3
2102 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2103 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2104 ; VI-NEXT: flat_load_dword v0, v[0:1]
2105 ; VI-NEXT: s_mov_b32 s3, 0xf000
2106 ; VI-NEXT: s_mov_b32 s2, -1
2107 ; VI-NEXT: s_waitcnt vmcnt(0)
2108 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
2109 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2110 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2113 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
2115 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2116 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2117 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2119 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2121 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
2122 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2123 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2124 ; GFX10-NEXT: s_endpgm
2126 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
2128 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2129 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2130 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2131 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2132 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2134 ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0
2135 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2136 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2137 ; GFX9-NEXT: s_endpgm
2139 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
2141 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2142 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2143 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2144 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2145 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2146 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2147 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2148 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
2149 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2150 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2151 ; GFX11-NEXT: s_nop 0
2152 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2153 ; GFX11-NEXT: s_endpgm
; IR under test: load i32, add 2, keep the low 8 bits, uitofp, store float.
2154 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2155 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2156 %load = load i32, ptr addrspace(1) %gep, align 4
2157 %add = add i32 %load, 2
2158 %inreg = and i32 %add, 255
2159 %cvt = uitofp i32 %inreg to float
2160 store float %cvt, ptr addrspace(1) %out, align 4
; Kernel under test: load an i32, isolate bits 8..15 (and with 65280, then
; lshr by 8) and convert the result with uitofp.
; The checks below expect the mask and shift to be folded into a single
; v_cvt_f32_ubyte1 of the loaded dword on every target.
; The assertions are autogenerated; regenerate them rather than editing by hand.
2164 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2165 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
2167 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2168 ; SI-NEXT: s_mov_b32 s7, 0xf000
2169 ; SI-NEXT: s_mov_b32 s10, 0
2170 ; SI-NEXT: s_mov_b32 s11, s7
2171 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2172 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2173 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2174 ; SI-NEXT: v_mov_b32_e32 v1, 0
2175 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2176 ; SI-NEXT: s_mov_b32 s6, -1
2177 ; SI-NEXT: s_mov_b32 s4, s0
2178 ; SI-NEXT: s_mov_b32 s5, s1
2179 ; SI-NEXT: s_waitcnt vmcnt(0)
2180 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2181 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2184 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
2186 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2187 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2188 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2189 ; VI-NEXT: v_mov_b32_e32 v1, s3
2190 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2191 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2192 ; VI-NEXT: flat_load_dword v0, v[0:1]
2193 ; VI-NEXT: s_mov_b32 s3, 0xf000
2194 ; VI-NEXT: s_mov_b32 s2, -1
2195 ; VI-NEXT: s_waitcnt vmcnt(0)
2196 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2197 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2200 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
2202 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2203 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2204 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2205 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2206 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2207 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2208 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2209 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2210 ; GFX10-NEXT: s_endpgm
2212 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
2214 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2215 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2216 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2217 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2218 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2220 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2221 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2222 ; GFX9-NEXT: s_endpgm
2224 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
2226 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2227 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2228 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2229 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2230 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2231 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2232 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2233 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2234 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2235 ; GFX11-NEXT: s_nop 0
2236 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2237 ; GFX11-NEXT: s_endpgm
; IR under test: 65280 is 0xff00, so %shr holds byte 1 of %load.
2238 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2239 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2240 %load = load i32, ptr addrspace(1) %gep, align 4
2241 %inreg = and i32 %load, 65280
2242 %shr = lshr i32 %inreg, 8
2243 %cvt = uitofp i32 %shr to float
2244 store float %cvt, ptr addrspace(1) %out, align 4
2248 ; We don't get these ones because of the zext, but instcombine removes
2249 ; them so it shouldn't really matter.
; NOTE(review): the checks below do show v_cvt_f32_ubyte0 being selected on
; every target for this zext + uitofp pattern, so the remark above may be
; stale -- confirm before relying on it.
; Kernel under test: load an i8, zero-extend it to i32 and convert it with
; uitofp. The checks expect a single unsigned byte load followed by
; v_cvt_f32_ubyte0 and a dword store.
; The assertions are autogenerated; regenerate them rather than editing by hand.
2250 define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2251 ; SI-LABEL: i8_zext_i32_to_f32:
2253 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2254 ; SI-NEXT: s_mov_b32 s7, 0xf000
2255 ; SI-NEXT: v_mov_b32_e32 v1, 0
2256 ; SI-NEXT: s_mov_b32 s10, 0
2257 ; SI-NEXT: s_mov_b32 s11, s7
2258 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2259 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2260 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
2261 ; SI-NEXT: s_mov_b32 s6, -1
2262 ; SI-NEXT: s_mov_b32 s4, s0
2263 ; SI-NEXT: s_mov_b32 s5, s1
2264 ; SI-NEXT: s_waitcnt vmcnt(0)
2265 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2266 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2269 ; VI-LABEL: i8_zext_i32_to_f32:
2271 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2272 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; VI-NEXT: v_mov_b32_e32 v1, s3
2274 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2275 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2276 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
2277 ; VI-NEXT: s_mov_b32 s3, 0xf000
2278 ; VI-NEXT: s_mov_b32 s2, -1
2279 ; VI-NEXT: s_waitcnt vmcnt(0)
2280 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2281 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2284 ; GFX10-LABEL: i8_zext_i32_to_f32:
2286 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2287 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2288 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2289 ; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7]
2290 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2291 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2292 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2293 ; GFX10-NEXT: s_endpgm
2295 ; GFX9-LABEL: i8_zext_i32_to_f32:
2297 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2298 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2299 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2300 ; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7]
2301 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2302 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2303 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2304 ; GFX9-NEXT: s_endpgm
2306 ; GFX11-LABEL: i8_zext_i32_to_f32:
2308 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2309 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2310 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2311 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
2312 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2313 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2314 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2315 ; GFX11-NEXT: s_nop 0
2316 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2317 ; GFX11-NEXT: s_endpgm
; IR under test: the gep is over i8, so the workitem id is used as a byte
; offset without scaling.
2318 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2319 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
2320 %load = load i8, ptr addrspace(1) %gep, align 1
2321 %ext = zext i8 %load to i32
2322 %cvt = uitofp i32 %ext to float
2323 store float %cvt, ptr addrspace(1) %out, align 4
; Kernel under test: load a byte-aligned (align 1) <4 x i8>, zero-extend it to
; <4 x i32> and convert it with uitofp to <4 x float>.
; The checks below expect, on every target, four separate unsigned byte loads
; (offsets 0..3), one v_cvt_f32_ubyte0 per loaded byte, and a single 128-bit
; store of the four results.
; The assertions are autogenerated; regenerate them rather than editing by hand.
2327 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2328 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
2330 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2331 ; SI-NEXT: s_mov_b32 s7, 0xf000
2332 ; SI-NEXT: s_mov_b32 s10, 0
2333 ; SI-NEXT: s_mov_b32 s11, s7
2334 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2335 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2336 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2337 ; SI-NEXT: v_mov_b32_e32 v1, 0
2338 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
2339 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
2340 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
2341 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
2342 ; SI-NEXT: s_mov_b32 s6, -1
2343 ; SI-NEXT: s_mov_b32 s4, s0
2344 ; SI-NEXT: s_mov_b32 s5, s1
2345 ; SI-NEXT: s_waitcnt vmcnt(3)
2346 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2
2347 ; SI-NEXT: s_waitcnt vmcnt(2)
2348 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
2349 ; SI-NEXT: s_waitcnt vmcnt(1)
2350 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
2351 ; SI-NEXT: s_waitcnt vmcnt(0)
2352 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2353 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2356 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
2358 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2359 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2360 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2361 ; VI-NEXT: v_mov_b32_e32 v1, s3
2362 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2363 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2364 ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2365 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2366 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
2367 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2368 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
2369 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
2370 ; VI-NEXT: flat_load_ubyte v4, v[0:1]
2371 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
2372 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2373 ; VI-NEXT: flat_load_ubyte v1, v[0:1]
2374 ; VI-NEXT: s_mov_b32 s3, 0xf000
2375 ; VI-NEXT: s_mov_b32 s2, -1
2376 ; VI-NEXT: s_waitcnt vmcnt(3)
2377 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
2378 ; VI-NEXT: s_waitcnt vmcnt(2)
2379 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
2380 ; VI-NEXT: s_waitcnt vmcnt(1)
2381 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
2382 ; VI-NEXT: s_waitcnt vmcnt(0)
2383 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
2384 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2387 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
2389 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2390 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2391 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
2392 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2393 ; GFX10-NEXT: s_clause 0x3
2394 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3
2395 ; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2
2396 ; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1
2397 ; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7]
2398 ; GFX10-NEXT: s_waitcnt vmcnt(3)
2399 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
2400 ; GFX10-NEXT: s_waitcnt vmcnt(2)
2401 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
2402 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2403 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
2404 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2405 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
2406 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5]
2407 ; GFX10-NEXT: s_endpgm
2409 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
2411 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2412 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2413 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
2414 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2415 ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3
2416 ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2
2417 ; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1
2418 ; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7]
2419 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2420 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
2421 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2422 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
2423 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2424 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
2425 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2426 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
2427 ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5]
2428 ; GFX9-NEXT: s_endpgm
2430 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
2432 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2433 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2434 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2435 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2436 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2437 ; GFX11-NEXT: s_clause 0x3
2438 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
2439 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
2440 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1
2441 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
2442 ; GFX11-NEXT: s_waitcnt vmcnt(3)
2443 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
2444 ; GFX11-NEXT: s_waitcnt vmcnt(2)
2445 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
2446 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2447 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
2448 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2449 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2450 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1]
2451 ; GFX11-NEXT: s_nop 0
2452 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2453 ; GFX11-NEXT: s_endpgm
; IR under test: index %in by the workitem id, load <4 x i8> (align 1),
; zext to <4 x i32>, uitofp to <4 x float>, store to %out (align 16).
2454 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2455 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
2456 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
2457 %ext = zext <4 x i8> %load to <4 x i32>
2458 %cvt = uitofp <4 x i32> %ext to <4 x float>
2459 store <4 x float> %cvt, ptr addrspace(1) %out, align 16
; Kernel under test: load an i32, mask it with 255 and convert the masked
; value with uitofp.
; The checks below expect a dword load followed directly by v_cvt_f32_ubyte0
; on every target; no AND instruction is expected after the load.
; The assertions are autogenerated; regenerate them rather than editing by hand.
2463 define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2464 ; SI-LABEL: extract_byte0_to_f32:
2466 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2467 ; SI-NEXT: s_mov_b32 s7, 0xf000
2468 ; SI-NEXT: s_mov_b32 s10, 0
2469 ; SI-NEXT: s_mov_b32 s11, s7
2470 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2471 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2472 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2473 ; SI-NEXT: v_mov_b32_e32 v1, 0
2474 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2475 ; SI-NEXT: s_mov_b32 s6, -1
2476 ; SI-NEXT: s_mov_b32 s4, s0
2477 ; SI-NEXT: s_mov_b32 s5, s1
2478 ; SI-NEXT: s_waitcnt vmcnt(0)
2479 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2480 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2483 ; VI-LABEL: extract_byte0_to_f32:
2485 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2486 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2487 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2488 ; VI-NEXT: v_mov_b32_e32 v1, s3
2489 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2490 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2491 ; VI-NEXT: flat_load_dword v0, v[0:1]
2492 ; VI-NEXT: s_mov_b32 s3, 0xf000
2493 ; VI-NEXT: s_mov_b32 s2, -1
2494 ; VI-NEXT: s_waitcnt vmcnt(0)
2495 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2496 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2499 ; GFX10-LABEL: extract_byte0_to_f32:
2501 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2502 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2503 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2504 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2505 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2506 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2507 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2508 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2509 ; GFX10-NEXT: s_endpgm
2511 ; GFX9-LABEL: extract_byte0_to_f32:
2513 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2514 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2515 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2516 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2517 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2518 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2519 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2520 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2521 ; GFX9-NEXT: s_endpgm
2523 ; GFX11-LABEL: extract_byte0_to_f32:
2525 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2526 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2527 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2528 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2529 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2530 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2531 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2532 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2533 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2534 ; GFX11-NEXT: s_nop 0
2535 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2536 ; GFX11-NEXT: s_endpgm
; IR under test: the load and store carry no explicit alignment here; %and
; keeps only the low byte of %val before the conversion.
2537 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2538 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2539 %val = load i32, ptr addrspace(1) %gep
2540 %and = and i32 %val, 255
2541 %cvt = uitofp i32 %and to float
2542 store float %cvt, ptr addrspace(1) %out
; extract_byte1_to_f32: every work-item loads the i32 at %in[workitem.id.x],
; isolates bits 15:8 with `lshr 8` followed by `and 255`, converts that value
; with uitofp and stores the resulting float to %out. Under each check prefix
; the expected code contains one v_cvt_f32_ubyte1_e32 between the dword load
; and the store.
2546 define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2547 ; SI-LABEL: extract_byte1_to_f32:
2549 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2550 ; SI-NEXT: s_mov_b32 s7, 0xf000
2551 ; SI-NEXT: s_mov_b32 s10, 0
2552 ; SI-NEXT: s_mov_b32 s11, s7
2553 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2555 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2556 ; SI-NEXT: v_mov_b32_e32 v1, 0
2557 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2558 ; SI-NEXT: s_mov_b32 s6, -1
2559 ; SI-NEXT: s_mov_b32 s4, s0
2560 ; SI-NEXT: s_mov_b32 s5, s1
2561 ; SI-NEXT: s_waitcnt vmcnt(0)
2562 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2563 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2566 ; VI-LABEL: extract_byte1_to_f32:
2568 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2569 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2570 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2571 ; VI-NEXT: v_mov_b32_e32 v1, s3
2572 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2573 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2574 ; VI-NEXT: flat_load_dword v0, v[0:1]
2575 ; VI-NEXT: s_mov_b32 s3, 0xf000
2576 ; VI-NEXT: s_mov_b32 s2, -1
2577 ; VI-NEXT: s_waitcnt vmcnt(0)
2578 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2579 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2582 ; GFX10-LABEL: extract_byte1_to_f32:
2584 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2585 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2586 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2587 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2588 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2589 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2590 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2591 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2592 ; GFX10-NEXT: s_endpgm
2594 ; GFX9-LABEL: extract_byte1_to_f32:
2596 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2597 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2598 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2599 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2600 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2601 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2602 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2603 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2604 ; GFX9-NEXT: s_endpgm
2606 ; GFX11-LABEL: extract_byte1_to_f32:
2608 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2609 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2610 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2611 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2612 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2613 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2614 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2615 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
2616 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2617 ; GFX11-NEXT: s_nop 0
2618 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2619 ; GFX11-NEXT: s_endpgm
; Address the input element that belongs to this work-item (x id).
2620 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2621 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2622 %val = load i32, ptr addrspace(1) %gep
; Shift byte 1 down to bits 7:0, then mask away the two bytes above it.
2623 %srl = lshr i32 %val, 8
2624 %and = and i32 %srl, 255
2625 %cvt = uitofp i32 %and to float
2626 store float %cvt, ptr addrspace(1) %out
; extract_byte2_to_f32: every work-item loads the i32 at %in[workitem.id.x],
; isolates bits 23:16 with `lshr 16` followed by `and 255`, converts that value
; with uitofp and stores the resulting float to %out. Under each check prefix
; the expected code contains one v_cvt_f32_ubyte2_e32 between the dword load
; and the store.
2630 define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2631 ; SI-LABEL: extract_byte2_to_f32:
2633 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2634 ; SI-NEXT: s_mov_b32 s7, 0xf000
2635 ; SI-NEXT: s_mov_b32 s10, 0
2636 ; SI-NEXT: s_mov_b32 s11, s7
2637 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2638 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2639 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2640 ; SI-NEXT: v_mov_b32_e32 v1, 0
2641 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2642 ; SI-NEXT: s_mov_b32 s6, -1
2643 ; SI-NEXT: s_mov_b32 s4, s0
2644 ; SI-NEXT: s_mov_b32 s5, s1
2645 ; SI-NEXT: s_waitcnt vmcnt(0)
2646 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
2647 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2650 ; VI-LABEL: extract_byte2_to_f32:
2652 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2653 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2654 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2655 ; VI-NEXT: v_mov_b32_e32 v1, s3
2656 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2657 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2658 ; VI-NEXT: flat_load_dword v0, v[0:1]
2659 ; VI-NEXT: s_mov_b32 s3, 0xf000
2660 ; VI-NEXT: s_mov_b32 s2, -1
2661 ; VI-NEXT: s_waitcnt vmcnt(0)
2662 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
2663 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2666 ; GFX10-LABEL: extract_byte2_to_f32:
2668 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2669 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2670 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2671 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2672 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2673 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2674 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
2675 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2676 ; GFX10-NEXT: s_endpgm
2678 ; GFX9-LABEL: extract_byte2_to_f32:
2680 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2681 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2682 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2683 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2684 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2685 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2686 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
2687 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2688 ; GFX9-NEXT: s_endpgm
2690 ; GFX11-LABEL: extract_byte2_to_f32:
2692 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2693 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2694 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2695 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2696 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2697 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2698 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2699 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
2700 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2701 ; GFX11-NEXT: s_nop 0
2702 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2703 ; GFX11-NEXT: s_endpgm
; Address the input element that belongs to this work-item (x id).
2704 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2705 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2706 %val = load i32, ptr addrspace(1) %gep
; Shift byte 2 down to bits 7:0, then mask away the byte above it.
2707 %srl = lshr i32 %val, 16
2708 %and = and i32 %srl, 255
2709 %cvt = uitofp i32 %and to float
2710 store float %cvt, ptr addrspace(1) %out
; extract_byte3_to_f32: every work-item loads the i32 at %in[workitem.id.x],
; isolates bits 31:24 with `lshr 24` followed by `and 255`, converts that value
; with uitofp and stores the resulting float to %out. Under each check prefix
; the expected code contains one v_cvt_f32_ubyte3_e32 between the dword load
; and the store.
2714 define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2715 ; SI-LABEL: extract_byte3_to_f32:
2717 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2718 ; SI-NEXT: s_mov_b32 s7, 0xf000
2719 ; SI-NEXT: s_mov_b32 s10, 0
2720 ; SI-NEXT: s_mov_b32 s11, s7
2721 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2722 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2723 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2724 ; SI-NEXT: v_mov_b32_e32 v1, 0
2725 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2726 ; SI-NEXT: s_mov_b32 s6, -1
2727 ; SI-NEXT: s_mov_b32 s4, s0
2728 ; SI-NEXT: s_mov_b32 s5, s1
2729 ; SI-NEXT: s_waitcnt vmcnt(0)
2730 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
2731 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2734 ; VI-LABEL: extract_byte3_to_f32:
2736 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2737 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2738 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2739 ; VI-NEXT: v_mov_b32_e32 v1, s3
2740 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2741 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2742 ; VI-NEXT: flat_load_dword v0, v[0:1]
2743 ; VI-NEXT: s_mov_b32 s3, 0xf000
2744 ; VI-NEXT: s_mov_b32 s2, -1
2745 ; VI-NEXT: s_waitcnt vmcnt(0)
2746 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
2747 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2750 ; GFX10-LABEL: extract_byte3_to_f32:
2752 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2753 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2754 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2755 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2756 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
2757 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2758 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
2759 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2760 ; GFX10-NEXT: s_endpgm
2762 ; GFX9-LABEL: extract_byte3_to_f32:
2764 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2765 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2766 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2767 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2768 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
2769 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2770 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
2771 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
2772 ; GFX9-NEXT: s_endpgm
2774 ; GFX11-LABEL: extract_byte3_to_f32:
2776 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2777 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2778 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2779 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2780 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2781 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2782 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2783 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
2784 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
2785 ; GFX11-NEXT: s_nop 0
2786 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2787 ; GFX11-NEXT: s_endpgm
; Address the input element that belongs to this work-item (x id).
2788 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2789 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2790 %val = load i32, ptr addrspace(1) %gep
; After a 24-bit logical shift only eight bits remain, so the mask below does
; not change the value; it keeps the IR shape parallel to the byte1/byte2 tests.
2791 %srl = lshr i32 %val, 24
2792 %and = and i32 %srl, 255
2793 %cvt = uitofp i32 %and to float
2794 store float %cvt, ptr addrspace(1) %out
; cvt_ubyte0_or_multiuse: the result of `or i32 %load, -2147483647`
; (0x80000001) has two users: its low byte is masked and converted with uitofp,
; and the full 32-bit value is bitcast to float. The two floats are added and
; the sum is stored to %out. Under each check prefix the expected code keeps
; the v_or_b32 with the full 0x80000001 constant, feeds its result to
; v_cvt_f32_ubyte0_e32, and then uses both values in v_add_f32.
; Note the argument order here is (%in, %out).
2798 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2799 ; SI-LABEL: cvt_ubyte0_or_multiuse:
2800 ; SI: ; %bb.0: ; %bb
2801 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2802 ; SI-NEXT: s_mov_b32 s7, 0xf000
2803 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2804 ; SI-NEXT: v_mov_b32_e32 v1, 0
2805 ; SI-NEXT: s_mov_b32 s6, -1
2806 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2807 ; SI-NEXT: s_mov_b32 s4, s2
2808 ; SI-NEXT: s_mov_b32 s5, s3
2809 ; SI-NEXT: s_mov_b32 s2, 0
2810 ; SI-NEXT: s_mov_b32 s3, s7
2811 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
2812 ; SI-NEXT: s_waitcnt vmcnt(0)
2813 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
2814 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
2815 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
2816 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2819 ; VI-LABEL: cvt_ubyte0_or_multiuse:
2820 ; VI: ; %bb.0: ; %bb
2821 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2822 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2823 ; VI-NEXT: s_mov_b32 s7, 0xf000
2824 ; VI-NEXT: s_mov_b32 s6, -1
2825 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2826 ; VI-NEXT: v_mov_b32_e32 v1, s1
2827 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2828 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2829 ; VI-NEXT: flat_load_dword v0, v[0:1]
2830 ; VI-NEXT: s_mov_b32 s4, s2
2831 ; VI-NEXT: s_mov_b32 s5, s3
2832 ; VI-NEXT: s_waitcnt vmcnt(0)
2833 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
2834 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
2835 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
2836 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2839 ; GFX10-LABEL: cvt_ubyte0_or_multiuse:
2840 ; GFX10: ; %bb.0: ; %bb
2841 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2842 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2843 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2844 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2845 ; GFX10-NEXT: global_load_dword v0, v0, s[4:5]
2846 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2847 ; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
2848 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
2849 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
2850 ; GFX10-NEXT: global_store_dword v2, v0, s[6:7]
2851 ; GFX10-NEXT: s_endpgm
2853 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
2854 ; GFX9: ; %bb.0: ; %bb
2855 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2856 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2857 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2858 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2859 ; GFX9-NEXT: global_load_dword v0, v0, s[4:5]
2860 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2861 ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0
2862 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
2863 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
2864 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
2865 ; GFX9-NEXT: s_endpgm
2867 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
2868 ; GFX11: ; %bb.0: ; %bb
2869 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2870 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2871 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2872 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2873 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2874 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2875 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
2876 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2877 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0
2878 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
2879 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2880 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
2881 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
2882 ; GFX11-NEXT: s_nop 0
2883 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2884 ; GFX11-NEXT: s_endpgm
; Address the input element that belongs to this work-item (x id).
2886 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
2887 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
2888 %load = load i32, ptr addrspace(1) %gep
; The constant sets bit 31 and bit 0; only bit 0 is visible to the byte-0 user.
2889 %or = or i32 %load, -2147483647
; First use of %or: low byte converted to float.
2890 %and = and i32 %or, 255
2891 %uitofp = uitofp i32 %and to float
; Second use of %or: all 32 bits reinterpreted as a float.
2892 %cast = bitcast i32 %or to float
2893 %add = fadd float %cast, %uitofp
2894 store float %add, ptr addrspace(1) %out
; Struct holding one array of four i8 values; it is the element type of the
; getelementptr in @cvt_f32_ubyte0_vector.
2898 %Vec = type { [4 x i8] }
; cvt_f32_ubyte0_vector: argument-less kernel whose addresses all come from
; undef pointers. It loads a 32-bit word with align 1, rewrites only the top
; byte as fptoui(i8)(float(byte3) * scale + 0.5), where scale is a float loaded
; from an undef address, and stores the recombined word back with align 1.
;
; Under each check prefix the expected code performs four single-byte loads
; (offsets 3, 2, 1, 0), applies v_cvt_f32_ubyte0_e32 to the byte loaded from
; offset 3, and finishes with four single-byte stores. The multiply-add is
; expected as v_fma_f32 for tahiti, gfx1010, gfx908 and gfx1100, and as a
; separate v_mul_f32 / v_add_f32 pair for tonga.
2900 define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
2901 ; SI-LABEL: cvt_f32_ubyte0_vector:
2902 ; SI: ; %bb.0: ; %entry
2903 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
2904 ; SI-NEXT: s_mov_b32 s3, 0xf000
2905 ; SI-NEXT: s_mov_b32 s2, -1
2906 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2907 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
2908 ; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
2909 ; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
2910 ; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
2911 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
2912 ; SI-NEXT: s_waitcnt vmcnt(3)
2913 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2914 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2915 ; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5
2916 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
2917 ; SI-NEXT: s_waitcnt vmcnt(2)
2918 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
2919 ; SI-NEXT: s_waitcnt vmcnt(2)
2920 ; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0
2921 ; SI-NEXT: s_waitcnt vmcnt(2)
2922 ; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0
2923 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
2926 ; VI-LABEL: cvt_f32_ubyte0_vector:
2927 ; VI: ; %bb.0: ; %entry
2928 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
2929 ; VI-NEXT: s_mov_b32 s3, 0xf000
2930 ; VI-NEXT: s_mov_b32 s2, -1
2931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2932 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
2933 ; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
2934 ; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
2935 ; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
2936 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
2937 ; VI-NEXT: s_waitcnt vmcnt(3)
2938 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
2939 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2940 ; VI-NEXT: v_mul_f32_e32 v0, s0, v0
2941 ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
2942 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
2943 ; VI-NEXT: s_waitcnt vmcnt(2)
2944 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0
2945 ; VI-NEXT: s_waitcnt vmcnt(2)
2946 ; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0
2947 ; VI-NEXT: s_waitcnt vmcnt(2)
2948 ; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0
2949 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
2952 ; GFX10-LABEL: cvt_f32_ubyte0_vector:
2953 ; GFX10: ; %bb.0: ; %entry
2954 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
2955 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2956 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2957 ; GFX10-NEXT: s_clause 0x3
2958 ; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3
2959 ; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2
2960 ; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
2961 ; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1]
2962 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2963 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
2964 ; GFX10-NEXT: s_waitcnt vmcnt(3)
2965 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
2966 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2967 ; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5
2968 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
2969 ; GFX10-NEXT: s_waitcnt vmcnt(2)
2970 ; GFX10-NEXT: global_store_byte v[0:1], v2, off
2971 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2972 ; GFX10-NEXT: global_store_byte v[0:1], v3, off
2973 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2974 ; GFX10-NEXT: global_store_byte v[0:1], v4, off
2975 ; GFX10-NEXT: global_store_byte v[0:1], v0, off
2976 ; GFX10-NEXT: s_endpgm
2978 ; GFX9-LABEL: cvt_f32_ubyte0_vector:
2979 ; GFX9: ; %bb.0: ; %entry
2980 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2981 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2982 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3
2983 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
2984 ; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2
2985 ; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1
2986 ; GFX9-NEXT: global_load_ubyte v6, v[0:1], off
2987 ; GFX9-NEXT: s_waitcnt vmcnt(4)
2988 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
2989 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2990 ; GFX9-NEXT: v_fma_f32 v0, v3, v0, 0.5
2991 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
2992 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2993 ; GFX9-NEXT: global_store_byte v[0:1], v4, off
2994 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2995 ; GFX9-NEXT: global_store_byte v[0:1], v5, off
2996 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2997 ; GFX9-NEXT: global_store_byte v[0:1], v6, off
2998 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
2999 ; GFX9-NEXT: s_endpgm
3001 ; GFX11-LABEL: cvt_f32_ubyte0_vector:
3002 ; GFX11: ; %bb.0: ; %entry
3003 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3004 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3006 ; GFX11-NEXT: s_clause 0x3
3007 ; GFX11-NEXT: global_load_u8 v1, v0, s[0:1] offset:3
3008 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] offset:2
3009 ; GFX11-NEXT: global_load_u8 v3, v0, s[0:1] offset:1
3010 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
3011 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
3012 ; GFX11-NEXT: s_waitcnt vmcnt(3)
3013 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
3014 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3015 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3016 ; GFX11-NEXT: v_fma_f32 v1, s0, v1, 0.5
3017 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
3018 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3019 ; GFX11-NEXT: s_clause 0x3
3020 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off
3021 ; GFX11-NEXT: global_store_b8 v[0:1], v3, off
3022 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
3023 ; GFX11-NEXT: global_store_b8 v[0:1], v1, off
3024 ; GFX11-NEXT: s_nop 0
3025 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3026 ; GFX11-NEXT: s_endpgm
3028 br label %for.body.i
3030 for.body.i: ; preds = %for.body.i, %entry
; Build the address of the word: a pointer loaded from an undef address,
; offset by an undef %Vec index, then cast into addrspace(1).
3031 %retval.sroa.0.0.copyload = load ptr, ptr addrspace(1) undef, align 8
3032 %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 undef
3033 %retval.sroa.0.0..sroa_cast_adr = addrspacecast ptr %add.ptr to ptr addrspace(1)
; Under-aligned (align 1) 32-bit load of the four packed bytes.
3034 %retval.sroa.0.0.copyload.i = load i32, ptr addrspace(1) %retval.sroa.0.0..sroa_cast_adr, align 1
; Take byte 3 (bits 31:24) as an i8 and convert it to float.
3035 %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
3036 %p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8
3037 %conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float
; Scale by a float loaded from an undef address, add 0.5, convert back to i8.
3038 %0 = load float, ptr addrspace(1) undef, align 8
3039 %mul = fmul contract float %0, %conv12
3040 %add = fadd contract float %mul, 5.000000e-01
3041 %conv13 = fptoui float %add to i8
; Reassemble the word: new byte 3, then original bytes 2, 1 and 0 OR-ed back in
; (masks 16711680 = 0xFF0000, 65280 = 0xFF00, 255 = 0xFF).
3042 %retval.sroa.4.0.insert.ext = zext i8 %conv13 to i32
3043 %retval.sroa.4.0.insert.shift = shl nuw i32 %retval.sroa.4.0.insert.ext, 24
3044 %retval.sroa.3.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 16711680
3045 %retval.sroa.3.0.insert.insert = or i32 %retval.sroa.4.0.insert.shift, %retval.sroa.3.0.insert.ext
3046 %retval.sroa.2.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 65280
3047 %retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext
3048 %retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255
3049 %retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext
; Under-aligned (align 1) store of the recombined word to an undef address.
3050 store i32 %retval.sroa.0.0.insert.insert, ptr addrspace(1) undef, align 1
; Module flags: one entry, !0, which sets "amdhsa_code_object_version" to 500.
3054 !llvm.module.flags = !{!0}
3055 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}