1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
6 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
7 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; Masking an i32 with 255 before uitofp selects v_cvt_f32_ubyte0 (byte-0
; unsigned convert) on all checked targets, eliminating the explicit AND.
9 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
10 ; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
12 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
14 ; GCN-NEXT: s_setpc_b64 s[30:31]
16 ; GFX10-LABEL: v_uitofp_i32_to_f32_mask255:
18 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
20 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
21 ; GFX10-NEXT: s_setpc_b64 s[30:31]
22 %masked = and i32 %arg0, 255
23 %cvt = uitofp i32 %masked to float
; sitofp of a value masked to [0,255] is known non-negative, so the same
; v_cvt_f32_ubyte0 selection applies as in the uitofp variant above.
27 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
28 ; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
30 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
32 ; GCN-NEXT: s_setpc_b64 s[30:31]
34 ; GFX10-LABEL: v_sitofp_i32_to_f32_mask255:
36 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
38 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
39 ; GFX10-NEXT: s_setpc_b64 s[30:31]
40 %masked = and i32 %arg0, 255
41 %cvt = sitofp i32 %masked to float
; A shift amount of 7 is not byte-aligned, so an explicit v_lshrrev_b32 is
; still emitted before the byte-0 convert; the mask itself is folded.
45 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
46 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
48 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0
50 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
51 ; GCN-NEXT: s_setpc_b64 s[30:31]
53 ; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255:
55 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
57 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0
58 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
59 ; GFX10-NEXT: s_setpc_b64 s[30:31]
60 %lshr.7 = lshr i32 %arg0, 7
61 %masked = and i32 %lshr.7, 255
62 %cvt = uitofp i32 %masked to float
; (lshr 8) + (and 255) + uitofp folds entirely into v_cvt_f32_ubyte1,
; which extracts and converts byte 1 in a single instruction.
66 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
67 ; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
69 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
71 ; GCN-NEXT: s_setpc_b64 s[30:31]
73 ; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255:
75 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
77 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
78 ; GFX10-NEXT: s_setpc_b64 s[30:31]
79 %lshr.8 = lshr i32 %arg0, 8
80 %masked = and i32 %lshr.8, 255
81 %cvt = uitofp i32 %masked to float
; Same lshr8+mask pattern, but the shifted value has a second use (the
; store), so the v_lshrrev_b32 must still be emitted; the convert still
; folds to v_cvt_f32_ubyte1 from the unshifted source.
85 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
86 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
88 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
90 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
91 ; SI-NEXT: s_mov_b32 s7, 0xf000
92 ; SI-NEXT: s_mov_b32 s6, -1
93 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
94 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
95 ; SI-NEXT: s_setpc_b64 s[30:31]
97 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
99 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
101 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
102 ; VI-NEXT: s_mov_b32 s7, 0xf000
103 ; VI-NEXT: s_mov_b32 s6, -1
104 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
105 ; VI-NEXT: s_waitcnt vmcnt(0)
106 ; VI-NEXT: s_setpc_b64 s[30:31]
108 ; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
110 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
112 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
113 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
114 ; GFX10-NEXT: global_store_dword v[0:1], v1, off
115 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
116 ; GFX10-NEXT: s_setpc_b64 s[30:31]
117 %lshr.8 = lshr i32 %arg0, 8
118 store i32 %lshr.8, i32 addrspace(1)* undef
119 %masked = and i32 %lshr.8, 255
120 %cvt = uitofp i32 %masked to float
; (lshr 16) + (and 255) + uitofp folds to v_cvt_f32_ubyte2 (byte 2).
124 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
125 ; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
127 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
129 ; GCN-NEXT: s_setpc_b64 s[30:31]
131 ; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255:
133 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
135 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
136 ; GFX10-NEXT: s_setpc_b64 s[30:31]
137 %lshr.16 = lshr i32 %arg0, 16
138 %masked = and i32 %lshr.16, 255
139 %cvt = uitofp i32 %masked to float
; (lshr 24) + (and 255) + uitofp folds to v_cvt_f32_ubyte3 (byte 3); the
; mask is redundant after a 24-bit shift but the fold still applies.
143 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
144 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
146 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
148 ; GCN-NEXT: s_setpc_b64 s[30:31]
150 ; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255:
152 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
154 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
155 ; GFX10-NEXT: s_setpc_b64 s[30:31]
156 %lshr.16 = lshr i32 %arg0, 24
157 %masked = and i32 %lshr.16, 255
158 %cvt = uitofp i32 %masked to float
; Direct uitofp of an i8 argument selects v_cvt_f32_ubyte0 with no
; explicit zero-extension of the incoming VGPR.
162 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
163 ; GCN-LABEL: v_uitofp_i8_to_f32:
165 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
167 ; GCN-NEXT: s_setpc_b64 s[30:31]
169 ; GFX10-LABEL: v_uitofp_i8_to_f32:
171 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
173 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
174 ; GFX10-NEXT: s_setpc_b64 s[30:31]
175 %cvt = uitofp i8 %arg0 to float
; An i16 bitcast to <2 x i8> converts via per-byte v_cvt_f32_ubyte0/1 from
; the same source register; no unpacking shifts are emitted.
179 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
180 ; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
182 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
184 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
185 ; GCN-NEXT: v_mov_b32_e32 v0, v2
186 ; GCN-NEXT: s_setpc_b64 s[30:31]
188 ; GFX10-LABEL: v_uitofp_v2i8_to_v2f32:
190 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
192 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
193 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
194 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
195 ; GFX10-NEXT: s_setpc_b64 s[30:31]
196 %val = bitcast i16 %arg0 to <2 x i8>
197 %cvt = uitofp <2 x i8> %val to <2 x float>
; trunc-to-i24 + bitcast to <3 x i8> converts via per-byte
; v_cvt_f32_ubyte0/1/2 of the original i32 source.
201 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
202 ; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
204 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
205 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
206 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
207 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
208 ; GCN-NEXT: v_mov_b32_e32 v0, v3
209 ; GCN-NEXT: s_setpc_b64 s[30:31]
211 ; GFX10-LABEL: v_uitofp_v3i8_to_v3f32:
213 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
215 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
216 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
217 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
218 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
219 ; GFX10-NEXT: s_setpc_b64 s[30:31]
220 %trunc = trunc i32 %arg0 to i24
221 %val = bitcast i24 %trunc to <3 x i8>
222 %cvt = uitofp <3 x i8> %val to <3 x float>
; i32 bitcast to <4 x i8> converts with all four v_cvt_f32_ubyte0..3
; instructions reading the same source register.
226 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
227 ; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
229 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
231 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
232 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
233 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
234 ; GCN-NEXT: v_mov_b32_e32 v0, v4
235 ; GCN-NEXT: s_setpc_b64 s[30:31]
237 ; GFX10-LABEL: v_uitofp_v4i8_to_v4f32:
239 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
241 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
242 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
243 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
244 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
245 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
246 ; GFX10-NEXT: s_setpc_b64 s[30:31]
247 %val = bitcast i32 %arg0 to <4 x i8>
248 %cvt = uitofp <4 x i8> %val to <4 x float>
; Manually unpacking four bytes with shift/mask/uitofp and reassembling the
; vector via insertelement produces the same per-byte convert sequence as
; the direct <4 x i8> bitcast above — all shifts/masks are folded away.
252 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
253 ; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
255 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
257 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
258 ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
259 ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
260 ; GCN-NEXT: v_mov_b32_e32 v0, v4
261 ; GCN-NEXT: s_setpc_b64 s[30:31]
263 ; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32:
265 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
267 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
268 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
269 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
270 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
271 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
272 ; GFX10-NEXT: s_setpc_b64 s[30:31]
273 %mask.arg0 = and i32 %arg0, 255
274 %cvt0 = uitofp i32 %mask.arg0 to float
276 %lshr.8 = lshr i32 %arg0, 8
277 %mask.lshr.8 = and i32 %lshr.8, 255
278 %cvt1 = uitofp i32 %mask.lshr.8 to float
280 %lshr.16 = lshr i32 %arg0, 16
281 %mask.lshr.16 = and i32 %lshr.16, 255
282 %cvt2 = uitofp i32 %mask.lshr.16 to float
284 %lshr.24 = lshr i32 %arg0, 24
285 %mask.lshr.24 = and i32 %lshr.24, 255
286 %cvt3 = uitofp i32 %mask.lshr.24 to float
288 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
289 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
290 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
291 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
292 ret <4 x float> %ins.3
; f16 result: the byte convert goes through f32 (v_cvt_f32_ubyte0) then
; v_cvt_f16_f32. SI has no f16 registers, so an extra f32<-f16 round-trip
; appears for the ABI return.
295 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
296 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
298 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
300 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
301 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
302 ; SI-NEXT: s_setpc_b64 s[30:31]
304 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
306 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
308 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
309 ; VI-NEXT: s_setpc_b64 s[30:31]
311 ; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
313 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
315 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
316 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
317 ; GFX10-NEXT: s_setpc_b64 s[30:31]
318 %masked = and i32 %arg0, 255
319 %cvt = uitofp i32 %masked to half
; sitofp variant of the f16 mask255 test; the masked value is known
; non-negative so codegen matches the uitofp version exactly.
323 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
324 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
326 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
328 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
329 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
330 ; SI-NEXT: s_setpc_b64 s[30:31]
332 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
334 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
336 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
337 ; VI-NEXT: s_setpc_b64 s[30:31]
339 ; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
341 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
343 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
344 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
345 ; GFX10-NEXT: s_setpc_b64 s[30:31]
346 %masked = and i32 %arg0, 255
347 %cvt = sitofp i32 %masked to half
; f16 result from byte 1: the shift+mask folds into v_cvt_f32_ubyte1,
; followed by the f32->f16 convert.
351 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
352 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
354 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
356 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
357 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
358 ; SI-NEXT: s_setpc_b64 s[30:31]
360 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
362 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
364 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
365 ; VI-NEXT: s_setpc_b64 s[30:31]
367 ; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
369 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
371 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
372 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
373 ; GFX10-NEXT: s_setpc_b64 s[30:31]
374 %lshr.8 = lshr i32 %arg0, 8
375 %masked = and i32 %lshr.8, 255
376 %cvt = uitofp i32 %masked to half
; f16 result from byte 2 via v_cvt_f32_ubyte2 then f32->f16.
380 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
381 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
383 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
385 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
386 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
387 ; SI-NEXT: s_setpc_b64 s[30:31]
389 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
391 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
393 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
394 ; VI-NEXT: s_setpc_b64 s[30:31]
396 ; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
398 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
400 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
401 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
402 ; GFX10-NEXT: s_setpc_b64 s[30:31]
403 %lshr.16 = lshr i32 %arg0, 16
404 %masked = and i32 %lshr.16, 255
405 %cvt = uitofp i32 %masked to half
; f16 result from byte 3 via v_cvt_f32_ubyte3 then f32->f16.
409 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
410 ; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
412 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
414 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
415 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
416 ; SI-NEXT: s_setpc_b64 s[30:31]
418 ; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
420 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
422 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
423 ; VI-NEXT: s_setpc_b64 s[30:31]
425 ; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
427 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
429 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
430 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
431 ; GFX10-NEXT: s_setpc_b64 s[30:31]
432 %lshr.16 = lshr i32 %arg0, 24
433 %masked = and i32 %lshr.16, 255
434 %cvt = uitofp i32 %masked to half
; i8 -> f16: targets with SDWA (VI/GFX10) use a single
; v_cvt_f16_u16_sdwa selecting BYTE_0; SI goes through f32.
438 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
439 ; SI-LABEL: v_uitofp_i8_to_f16:
441 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
443 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
444 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
445 ; SI-NEXT: s_setpc_b64 s[30:31]
447 ; VI-LABEL: v_uitofp_i8_to_f16:
449 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450 ; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
451 ; VI-NEXT: s_setpc_b64 s[30:31]
453 ; GFX10-LABEL: v_uitofp_i8_to_f16:
455 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
457 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
458 ; GFX10-NEXT: s_setpc_b64 s[30:31]
459 %cvt = uitofp i8 %arg0 to half
; f64 result: there is no ubyte variant of the f64 convert, so an explicit
; v_and_b32 with 0xff precedes v_cvt_f64_u32.
463 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
464 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
466 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
468 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
469 ; GCN-NEXT: s_setpc_b64 s[30:31]
471 ; GFX10-LABEL: v_uitofp_i32_to_f64_mask255:
473 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
475 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
476 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
477 ; GFX10-NEXT: s_setpc_b64 s[30:31]
478 %masked = and i32 %arg0, 255
479 %cvt = uitofp i32 %masked to double
; f64 from byte 1: the shift+mask is combined into a single
; v_bfe_u32 (offset 8, width 8) before v_cvt_f64_u32.
483 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
484 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
486 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487 ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8
488 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
489 ; GCN-NEXT: s_setpc_b64 s[30:31]
491 ; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255:
493 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
494 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
495 ; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8
496 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
497 ; GFX10-NEXT: s_setpc_b64 s[30:31]
498 %lshr.8 = lshr i32 %arg0, 8
499 %masked = and i32 %lshr.8, 255
500 %cvt = uitofp i32 %masked to double
; f64 from byte 2: v_bfe_u32 (offset 16, width 8) then v_cvt_f64_u32.
504 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
505 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
507 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8
509 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
510 ; GCN-NEXT: s_setpc_b64 s[30:31]
512 ; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255:
514 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
516 ; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8
517 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
518 ; GFX10-NEXT: s_setpc_b64 s[30:31]
519 %lshr.16 = lshr i32 %arg0, 16
520 %masked = and i32 %lshr.16, 255
521 %cvt = uitofp i32 %masked to double
; f64 from byte 3: the mask is redundant after lshr 24, so only a plain
; v_lshrrev_b32 remains before v_cvt_f64_u32.
525 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
526 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
528 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0
530 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
531 ; GCN-NEXT: s_setpc_b64 s[30:31]
533 ; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255:
535 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
537 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
538 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
539 ; GFX10-NEXT: s_setpc_b64 s[30:31]
540 %lshr.16 = lshr i32 %arg0, 24
541 %masked = and i32 %lshr.16, 255
542 %cvt = uitofp i32 %masked to double
; i8 -> f64: SI masks with v_and_b32; VI/GFX10 use v_and_b32_sdwa with
; src1_sel:BYTE_0 to select the low byte before v_cvt_f64_u32.
546 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
547 ; SI-LABEL: v_uitofp_i8_to_f64:
549 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
551 ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
552 ; SI-NEXT: s_setpc_b64 s[30:31]
554 ; VI-LABEL: v_uitofp_i8_to_f64:
556 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
558 ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
559 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
560 ; VI-NEXT: s_setpc_b64 s[30:31]
562 ; GFX10-LABEL: v_uitofp_i8_to_f64:
564 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
566 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff
567 ; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
568 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
569 ; GFX10-NEXT: s_setpc_b64 s[30:31]
570 %cvt = uitofp i8 %arg0 to double
; Kernel: a loaded i8 feeds uitofp directly — the ubyte load result is
; converted with v_cvt_f32_ubyte0 and no extra extension instructions.
574 define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
575 ; SI-LABEL: load_i8_to_f32:
577 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
578 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
579 ; SI-NEXT: s_mov_b32 s7, 0xf000
580 ; SI-NEXT: v_mov_b32_e32 v1, 0
581 ; SI-NEXT: s_mov_b32 s2, 0
582 ; SI-NEXT: s_mov_b32 s3, s7
583 ; SI-NEXT: s_waitcnt lgkmcnt(0)
584 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
585 ; SI-NEXT: s_mov_b32 s6, -1
586 ; SI-NEXT: s_waitcnt vmcnt(0)
587 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
588 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
591 ; VI-LABEL: load_i8_to_f32:
593 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
594 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
595 ; VI-NEXT: s_mov_b32 s7, 0xf000
596 ; VI-NEXT: s_mov_b32 s6, -1
597 ; VI-NEXT: s_waitcnt lgkmcnt(0)
598 ; VI-NEXT: v_mov_b32_e32 v1, s1
599 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
600 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
601 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
602 ; VI-NEXT: s_waitcnt vmcnt(0)
603 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
604 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
607 ; GFX10-LABEL: load_i8_to_f32:
609 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
610 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
611 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
612 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
613 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
614 ; GFX10-NEXT: s_waitcnt vmcnt(0)
615 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
616 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
617 ; GFX10-NEXT: s_endpgm
618 %tid = call i32 @llvm.amdgcn.workitem.id.x()
619 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
620 %load = load i8, i8 addrspace(1)* %gep, align 1
621 %cvt = uitofp i8 %load to float
622 store float %cvt, float addrspace(1)* %out, align 4
; Kernel: a 2-byte vector load (via ushort) converts both lanes with
; v_cvt_f32_ubyte0/1 from the single loaded register.
626 define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
627 ; SI-LABEL: load_v2i8_to_v2f32:
629 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
630 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
631 ; SI-NEXT: s_mov_b32 s7, 0xf000
632 ; SI-NEXT: s_mov_b32 s2, 0
633 ; SI-NEXT: s_mov_b32 s3, s7
634 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
635 ; SI-NEXT: v_mov_b32_e32 v1, 0
636 ; SI-NEXT: s_waitcnt lgkmcnt(0)
637 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
638 ; SI-NEXT: s_mov_b32 s6, -1
639 ; SI-NEXT: s_waitcnt vmcnt(0)
640 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
641 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
642 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
645 ; VI-LABEL: load_v2i8_to_v2f32:
647 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
648 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
649 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
650 ; VI-NEXT: s_mov_b32 s7, 0xf000
651 ; VI-NEXT: s_mov_b32 s6, -1
652 ; VI-NEXT: s_waitcnt lgkmcnt(0)
653 ; VI-NEXT: v_mov_b32_e32 v1, s1
654 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
655 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
656 ; VI-NEXT: flat_load_ushort v0, v[0:1]
657 ; VI-NEXT: s_waitcnt vmcnt(0)
658 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
659 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
660 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
663 ; GFX10-LABEL: load_v2i8_to_v2f32:
665 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
666 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
667 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
668 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
669 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
671 ; GFX10-NEXT: s_waitcnt vmcnt(0)
672 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
673 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
674 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
675 ; GFX10-NEXT: s_endpgm
676 %tid = call i32 @llvm.amdgcn.workitem.id.x()
677 %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
678 %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
679 %cvt = uitofp <2 x i8> %load to <2 x float>
680 store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
; Kernel: a <3 x i8> load with align 4 is widened to a dword load; the three
; lanes convert via v_cvt_f32_ubyte0/1/2 of the loaded register.
684 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
685 ; SI-LABEL: load_v3i8_to_v3f32:
687 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
688 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
689 ; SI-NEXT: s_mov_b32 s7, 0xf000
690 ; SI-NEXT: s_mov_b32 s2, 0
691 ; SI-NEXT: s_mov_b32 s3, s7
692 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
693 ; SI-NEXT: v_mov_b32_e32 v1, 0
694 ; SI-NEXT: s_waitcnt lgkmcnt(0)
695 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
696 ; SI-NEXT: s_mov_b32 s6, -1
697 ; SI-NEXT: s_waitcnt vmcnt(0)
698 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2
699 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
700 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
701 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
702 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
705 ; VI-LABEL: load_v3i8_to_v3f32:
707 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
708 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
709 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
710 ; VI-NEXT: s_mov_b32 s7, 0xf000
711 ; VI-NEXT: s_mov_b32 s6, -1
712 ; VI-NEXT: s_waitcnt lgkmcnt(0)
713 ; VI-NEXT: v_mov_b32_e32 v1, s1
714 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
715 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
716 ; VI-NEXT: flat_load_dword v0, v[0:1]
717 ; VI-NEXT: s_waitcnt vmcnt(0)
718 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
719 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
720 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
721 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
724 ; GFX10-LABEL: load_v3i8_to_v3f32:
726 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
727 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
728 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
729 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
730 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
731 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
732 ; GFX10-NEXT: s_waitcnt vmcnt(0)
733 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
734 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
735 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
736 ; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
737 ; GFX10-NEXT: s_endpgm
738 %tid = call i32 @llvm.amdgcn.workitem.id.x()
739 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
740 %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
741 %cvt = uitofp <3 x i8> %load to <3 x float>
742 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
; Kernel: an aligned <4 x i8> load becomes a single dword load; all four
; lanes convert with v_cvt_f32_ubyte0..3 of the one loaded register.
746 define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
747 ; SI-LABEL: load_v4i8_to_v4f32:
749 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
750 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
751 ; SI-NEXT: s_mov_b32 s7, 0xf000
752 ; SI-NEXT: s_mov_b32 s2, 0
753 ; SI-NEXT: s_mov_b32 s3, s7
754 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
755 ; SI-NEXT: v_mov_b32_e32 v1, 0
756 ; SI-NEXT: s_waitcnt lgkmcnt(0)
757 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
758 ; SI-NEXT: s_mov_b32 s6, -1
759 ; SI-NEXT: s_waitcnt vmcnt(0)
760 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
761 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
762 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
763 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
764 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
767 ; VI-LABEL: load_v4i8_to_v4f32:
769 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
770 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
771 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
772 ; VI-NEXT: s_mov_b32 s7, 0xf000
773 ; VI-NEXT: s_mov_b32 s6, -1
774 ; VI-NEXT: s_waitcnt lgkmcnt(0)
775 ; VI-NEXT: v_mov_b32_e32 v1, s1
776 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
777 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
778 ; VI-NEXT: flat_load_dword v0, v[0:1]
779 ; VI-NEXT: s_waitcnt vmcnt(0)
780 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
781 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
782 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
783 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
784 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
787 ; GFX10-LABEL: load_v4i8_to_v4f32:
789 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
790 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
791 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
792 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
793 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
794 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
795 ; GFX10-NEXT: s_waitcnt vmcnt(0)
796 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
797 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
798 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
799 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
800 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
801 ; GFX10-NEXT: s_endpgm
802 %tid = call i32 @llvm.amdgcn.workitem.id.x()
803 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
804 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
805 %cvt = uitofp <4 x i8> %load to <4 x float>
806 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
810 ; This should not be adding instructions to shift into the correct
811 ; position in the word for the component.
813 ; FIXME: Packing bytes
; Kernel: align-1 <4 x i8> load splits into four ubyte loads; each byte is
; converted separately (SI still repacks two of them — see FIXME above).
814 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
815 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
817 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
818 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
819 ; SI-NEXT: s_mov_b32 s7, 0xf000
820 ; SI-NEXT: s_mov_b32 s2, 0
821 ; SI-NEXT: s_mov_b32 s3, s7
822 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
823 ; SI-NEXT: v_mov_b32_e32 v1, 0
824 ; SI-NEXT: s_waitcnt lgkmcnt(0)
825 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
826 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
827 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
828 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
829 ; SI-NEXT: s_mov_b32 s6, -1
830 ; SI-NEXT: s_waitcnt vmcnt(2)
831 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
832 ; SI-NEXT: s_waitcnt vmcnt(0)
833 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
834 ; SI-NEXT: v_or_b32_e32 v0, v0, v3
835 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
836 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
837 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
838 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
839 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
842 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
844 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
845 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
846 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
847 ; VI-NEXT: s_mov_b32 s7, 0xf000
848 ; VI-NEXT: s_mov_b32 s6, -1
849 ; VI-NEXT: s_waitcnt lgkmcnt(0)
850 ; VI-NEXT: v_mov_b32_e32 v1, s1
851 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
852 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
853 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
854 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
855 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
856 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
857 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
858 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
859 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
860 ; VI-NEXT: flat_load_ubyte v5, v[6:7]
861 ; VI-NEXT: flat_load_ubyte v6, v[2:3]
862 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
863 ; VI-NEXT: s_waitcnt vmcnt(3)
864 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
865 ; VI-NEXT: s_waitcnt vmcnt(2)
866 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
867 ; VI-NEXT: s_waitcnt vmcnt(1)
868 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
869 ; VI-NEXT: s_waitcnt vmcnt(0)
870 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
871 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
874 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
876 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
877 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
878 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
879 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
880 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
881 ; GFX10-NEXT: s_clause 0x3
882 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
883 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
884 ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
885 ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3]
886 ; GFX10-NEXT: s_waitcnt vmcnt(3)
887 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
888 ; GFX10-NEXT: s_waitcnt vmcnt(2)
889 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
890 ; GFX10-NEXT: s_waitcnt vmcnt(1)
891 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
892 ; GFX10-NEXT: s_waitcnt vmcnt(0)
893 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
894 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
895 ; GFX10-NEXT: s_endpgm
896 %tid = call i32 @llvm.amdgcn.workitem.id.x()
897 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
898 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
899 %cvt = uitofp <4 x i8> %load to <4 x float>
900 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
904 ; FIXME: Need to handle non-uniform case for function below (load without gep).
905 ; Instructions still emitted to repack bytes for add use.
906 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
907 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
909 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
910 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
911 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
912 ; SI-NEXT: s_mov_b32 s11, 0xf000
913 ; SI-NEXT: s_mov_b32 s2, 0
914 ; SI-NEXT: s_mov_b32 s3, s11
915 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
916 ; SI-NEXT: v_mov_b32_e32 v1, 0
917 ; SI-NEXT: s_waitcnt lgkmcnt(0)
918 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
919 ; SI-NEXT: s_mov_b32 s10, -1
920 ; SI-NEXT: s_movk_i32 s0, 0xff
921 ; SI-NEXT: s_mov_b32 s6, s10
922 ; SI-NEXT: s_mov_b32 s7, s11
923 ; SI-NEXT: s_waitcnt vmcnt(0)
924 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
925 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
926 ; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4
927 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
928 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
929 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
930 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
931 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
932 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
933 ; SI-NEXT: s_waitcnt expcnt(0)
934 ; SI-NEXT: v_and_b32_e32 v0, s0, v4
935 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
936 ; SI-NEXT: v_or_b32_e32 v0, v7, v0
937 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
938 ; SI-NEXT: v_and_b32_e32 v2, s0, v2
939 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
940 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
941 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
942 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
943 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
944 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
945 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
948 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
950 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
951 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
952 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
953 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
954 ; VI-NEXT: s_mov_b32 s11, 0xf000
955 ; VI-NEXT: s_mov_b32 s10, -1
956 ; VI-NEXT: v_mov_b32_e32 v5, 9
957 ; VI-NEXT: s_waitcnt lgkmcnt(0)
958 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
959 ; VI-NEXT: v_mov_b32_e32 v1, s1
960 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
961 ; VI-NEXT: flat_load_dword v4, v[0:1]
962 ; VI-NEXT: s_mov_b32 s6, s10
963 ; VI-NEXT: s_mov_b32 s7, s11
964 ; VI-NEXT: s_movk_i32 s0, 0x900
965 ; VI-NEXT: s_waitcnt vmcnt(0)
966 ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
967 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
968 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
969 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
970 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
971 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
972 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
973 ; VI-NEXT: v_add_u16_e32 v8, 9, v4
974 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
975 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
976 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
977 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
978 ; VI-NEXT: v_mov_b32_e32 v2, s0
979 ; VI-NEXT: v_add_u16_e32 v0, s0, v0
980 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
981 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
982 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
985 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
987 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
988 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
989 ; GFX10-NEXT: v_mov_b32_e32 v1, 24
990 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
991 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
992 ; GFX10-NEXT: s_clause 0x1
993 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
994 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
995 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
996 ; GFX10-NEXT: s_movk_i32 s0, 0x900
997 ; GFX10-NEXT: s_waitcnt vmcnt(0)
998 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
999 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1000 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
1001 ; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
1002 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 9
1003 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1004 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1005 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1006 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1007 ; GFX10-NEXT: v_add_nc_u16 v1, v1, s0
1008 ; GFX10-NEXT: v_add_nc_u16 v5, v2, s0
1009 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1010 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
1011 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1012 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1013 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1014 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1015 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
1016 ; GFX10-NEXT: global_store_dword v4, v5, s[4:5]
1017 ; GFX10-NEXT: s_endpgm
1018 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
1019 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
1020 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
1021 %cvt = uitofp <4 x i8> %load to <4 x float>
1022 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
1023 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
1024 store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
1028 ; Make sure this doesn't crash.
1029 define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
1030 ; SI-LABEL: load_v7i8_to_v7f32:
1032 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1033 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1034 ; SI-NEXT: s_mov_b32 s7, 0xf000
1035 ; SI-NEXT: s_mov_b32 s2, 0
1036 ; SI-NEXT: s_mov_b32 s3, s7
1037 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1038 ; SI-NEXT: v_mov_b32_e32 v1, 0
1039 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1040 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
1041 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
1042 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
1043 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
1044 ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
1045 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
1046 ; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
1047 ; SI-NEXT: s_mov_b32 s6, -1
1048 ; SI-NEXT: s_waitcnt vmcnt(6)
1049 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1050 ; SI-NEXT: s_waitcnt vmcnt(5)
1051 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
1052 ; SI-NEXT: s_waitcnt vmcnt(3)
1053 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
1054 ; SI-NEXT: v_or_b32_e32 v3, v9, v6
1055 ; SI-NEXT: s_waitcnt vmcnt(1)
1056 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
1057 ; SI-NEXT: s_waitcnt vmcnt(0)
1058 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
1059 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24
1060 ; SI-NEXT: s_waitcnt expcnt(0)
1061 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1062 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
1063 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
1064 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
1065 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
1066 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1069 ; VI-LABEL: load_v7i8_to_v7f32:
1071 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1072 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1073 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1074 ; VI-NEXT: s_mov_b32 s7, 0xf000
1075 ; VI-NEXT: s_mov_b32 s6, -1
1076 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1077 ; VI-NEXT: v_mov_b32_e32 v1, s1
1078 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1079 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1080 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
1081 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1082 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
1083 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1084 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
1085 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1086 ; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
1087 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1088 ; VI-NEXT: flat_load_ubyte v10, v[4:5]
1089 ; VI-NEXT: flat_load_ubyte v11, v[6:7]
1090 ; VI-NEXT: flat_load_ubyte v8, v[8:9]
1091 ; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
1092 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1093 ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
1094 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1095 ; VI-NEXT: flat_load_ubyte v6, v[6:7]
1096 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
1097 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1098 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1099 ; VI-NEXT: s_waitcnt vmcnt(4)
1100 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
1101 ; VI-NEXT: s_waitcnt vmcnt(3)
1102 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
1103 ; VI-NEXT: s_waitcnt vmcnt(2)
1104 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
1105 ; VI-NEXT: s_waitcnt vmcnt(1)
1106 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1107 ; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1108 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
1109 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
1110 ; VI-NEXT: s_waitcnt vmcnt(0)
1111 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1112 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
1113 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
1114 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1117 ; GFX10-LABEL: load_v7i8_to_v7f32:
1119 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1120 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1121 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1122 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1123 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
1124 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GFX10-NEXT: s_clause 0x5
1126 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
1127 ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
1128 ; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
1129 ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
1130 ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
1131 ; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
1132 ; GFX10-NEXT: s_waitcnt vmcnt(4)
1133 ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
1134 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1135 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
1136 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1137 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
1138 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
1139 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1140 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
1141 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1142 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1143 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1144 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1145 ; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
1146 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
1147 ; GFX10-NEXT: s_endpgm
1148 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1149 %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
1150 %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
1151 %cvt = uitofp <7 x i8> %load to <7 x float>
1152 store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
1156 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
1157 ; SI-LABEL: load_v8i8_to_v8f32:
1159 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1160 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1161 ; SI-NEXT: s_mov_b32 s7, 0xf000
1162 ; SI-NEXT: s_mov_b32 s2, 0
1163 ; SI-NEXT: s_mov_b32 s3, s7
1164 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1165 ; SI-NEXT: v_mov_b32_e32 v1, 0
1166 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1167 ; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
1168 ; SI-NEXT: s_mov_b32 s6, -1
1169 ; SI-NEXT: s_waitcnt vmcnt(0)
1170 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
1171 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
1172 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
1173 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1174 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
1175 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
1176 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
1177 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
1178 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1179 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1182 ; VI-LABEL: load_v8i8_to_v8f32:
1184 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1185 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1186 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1187 ; VI-NEXT: s_mov_b32 s7, 0xf000
1188 ; VI-NEXT: s_mov_b32 s6, -1
1189 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1190 ; VI-NEXT: v_mov_b32_e32 v1, s1
1191 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1192 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1193 ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
1194 ; VI-NEXT: s_waitcnt vmcnt(0)
1195 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
1196 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
1197 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
1198 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1199 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
1200 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
1201 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
1202 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
1203 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1204 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1207 ; GFX10-LABEL: load_v8i8_to_v8f32:
1209 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1210 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1211 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1212 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1214 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3]
1215 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1216 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9
1217 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9
1218 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9
1219 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
1220 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8
1221 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8
1222 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8
1223 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8
1224 ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
1225 ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
1226 ; GFX10-NEXT: s_endpgm
1227 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1228 %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
1229 %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
1230 %cvt = uitofp <8 x i8> %load to <8 x float>
1231 store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
1235 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1236 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
1238 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1239 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1240 ; SI-NEXT: s_mov_b32 s7, 0xf000
1241 ; SI-NEXT: s_mov_b32 s2, 0
1242 ; SI-NEXT: s_mov_b32 s3, s7
1243 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1244 ; SI-NEXT: v_mov_b32_e32 v1, 0
1245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1246 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1247 ; SI-NEXT: s_mov_b32 s6, -1
1248 ; SI-NEXT: s_waitcnt vmcnt(0)
1249 ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
1250 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1251 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1254 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
1256 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1257 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1258 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1259 ; VI-NEXT: s_mov_b32 s7, 0xf000
1260 ; VI-NEXT: s_mov_b32 s6, -1
1261 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1262 ; VI-NEXT: v_mov_b32_e32 v1, s1
1263 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1264 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1265 ; VI-NEXT: flat_load_dword v0, v[0:1]
1266 ; VI-NEXT: s_waitcnt vmcnt(0)
1267 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
1268 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1269 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1272 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
1274 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1275 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1276 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1277 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1278 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1279 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1280 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1281 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
1282 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1283 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1284 ; GFX10-NEXT: s_endpgm
1285 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1286 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1287 %load = load i32, i32 addrspace(1)* %gep, align 4
1288 %add = add i32 %load, 2
1289 %inreg = and i32 %add, 255
1290 %cvt = uitofp i32 %inreg to float
1291 store float %cvt, float addrspace(1)* %out, align 4
1295 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1296 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
1298 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1299 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1300 ; SI-NEXT: s_mov_b32 s7, 0xf000
1301 ; SI-NEXT: s_mov_b32 s2, 0
1302 ; SI-NEXT: s_mov_b32 s3, s7
1303 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1304 ; SI-NEXT: v_mov_b32_e32 v1, 0
1305 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1306 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1307 ; SI-NEXT: s_mov_b32 s6, -1
1308 ; SI-NEXT: s_waitcnt vmcnt(0)
1309 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1310 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1313 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
1315 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1316 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1317 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1318 ; VI-NEXT: s_mov_b32 s7, 0xf000
1319 ; VI-NEXT: s_mov_b32 s6, -1
1320 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1321 ; VI-NEXT: v_mov_b32_e32 v1, s1
1322 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1323 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1324 ; VI-NEXT: flat_load_dword v0, v[0:1]
1325 ; VI-NEXT: s_waitcnt vmcnt(0)
1326 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1327 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1330 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
1332 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1333 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1334 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1335 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1336 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1337 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1338 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1339 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1340 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1341 ; GFX10-NEXT: s_endpgm
1342 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1343 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1344 %load = load i32, i32 addrspace(1)* %gep, align 4
1345 %inreg = and i32 %load, 65280
1346 %shr = lshr i32 %inreg, 8
1347 %cvt = uitofp i32 %shr to float
1348 store float %cvt, float addrspace(1)* %out, align 4
1352 ; We don't get these ones because of the zext, but instcombine removes
1353 ; them so it shouldn't really matter.
1354 define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
1355 ; SI-LABEL: i8_zext_i32_to_f32:
1357 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1358 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1359 ; SI-NEXT: s_mov_b32 s7, 0xf000
1360 ; SI-NEXT: v_mov_b32_e32 v1, 0
1361 ; SI-NEXT: s_mov_b32 s2, 0
1362 ; SI-NEXT: s_mov_b32 s3, s7
1363 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1364 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
1365 ; SI-NEXT: s_mov_b32 s6, -1
1366 ; SI-NEXT: s_waitcnt vmcnt(0)
1367 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1368 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1371 ; VI-LABEL: i8_zext_i32_to_f32:
1373 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1374 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1375 ; VI-NEXT: s_mov_b32 s7, 0xf000
1376 ; VI-NEXT: s_mov_b32 s6, -1
1377 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1378 ; VI-NEXT: v_mov_b32_e32 v1, s1
1379 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1380 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1381 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1382 ; VI-NEXT: s_waitcnt vmcnt(0)
1383 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1384 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1387 ; GFX10-LABEL: i8_zext_i32_to_f32:
1389 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1390 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1391 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1392 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1393 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1394 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1395 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1396 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1397 ; GFX10-NEXT: s_endpgm
1398 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1399 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
1400 %load = load i8, i8 addrspace(1)* %gep, align 1
1401 %ext = zext i8 %load to i32
1402 %cvt = uitofp i32 %ext to float
1403 store float %cvt, float addrspace(1)* %out, align 4
1407 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
1408 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
1410 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1411 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1412 ; SI-NEXT: s_mov_b32 s7, 0xf000
1413 ; SI-NEXT: s_mov_b32 s2, 0
1414 ; SI-NEXT: s_mov_b32 s3, s7
1415 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1416 ; SI-NEXT: v_mov_b32_e32 v1, 0
1417 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1418 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
1419 ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
1420 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
1421 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
1422 ; SI-NEXT: s_mov_b32 s6, -1
1423 ; SI-NEXT: s_waitcnt vmcnt(2)
1424 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
1425 ; SI-NEXT: s_waitcnt vmcnt(0)
1426 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1427 ; SI-NEXT: v_or_b32_e32 v0, v0, v3
1428 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1429 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1430 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1431 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1432 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1435 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
1437 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1438 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1439 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1440 ; VI-NEXT: s_mov_b32 s7, 0xf000
1441 ; VI-NEXT: s_mov_b32 s6, -1
1442 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1443 ; VI-NEXT: v_mov_b32_e32 v1, s1
1444 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1445 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1446 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
1447 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1448 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
1449 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1450 ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
1451 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1452 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
1453 ; VI-NEXT: flat_load_ubyte v5, v[6:7]
1454 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1455 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1456 ; VI-NEXT: s_waitcnt vmcnt(1)
1457 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1458 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1459 ; VI-NEXT: s_waitcnt vmcnt(0)
1460 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1461 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
1462 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
1463 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
1464 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1467 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
1469 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1470 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1471 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1472 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
1473 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1474 ; GFX10-NEXT: s_clause 0x3
1475 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
1476 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
1477 ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1
1478 ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3]
1479 ; GFX10-NEXT: s_waitcnt vmcnt(2)
1480 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2
1481 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1482 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
1483 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1484 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1485 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1486 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1487 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1488 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1489 ; GFX10-NEXT: s_endpgm
1490 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1491 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
1492 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
1493 %ext = zext <4 x i8> %load to <4 x i32>
1494 %cvt = uitofp <4 x i32> %ext to <4 x float>
1495 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
1499 define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1500 ; SI-LABEL: extract_byte0_to_f32:
1502 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1503 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1504 ; SI-NEXT: s_mov_b32 s7, 0xf000
1505 ; SI-NEXT: s_mov_b32 s2, 0
1506 ; SI-NEXT: s_mov_b32 s3, s7
1507 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1508 ; SI-NEXT: v_mov_b32_e32 v1, 0
1509 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1510 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1511 ; SI-NEXT: s_mov_b32 s6, -1
1512 ; SI-NEXT: s_waitcnt vmcnt(0)
1513 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1514 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1517 ; VI-LABEL: extract_byte0_to_f32:
1519 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1520 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1521 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1522 ; VI-NEXT: s_mov_b32 s7, 0xf000
1523 ; VI-NEXT: s_mov_b32 s6, -1
1524 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1525 ; VI-NEXT: v_mov_b32_e32 v1, s1
1526 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1527 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1528 ; VI-NEXT: flat_load_dword v0, v[0:1]
1529 ; VI-NEXT: s_waitcnt vmcnt(0)
1530 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1531 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1534 ; GFX10-LABEL: extract_byte0_to_f32:
1536 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1537 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1538 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1539 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1540 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1541 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1542 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1543 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1544 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1545 ; GFX10-NEXT: s_endpgm
1546 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1547 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1548 %val = load i32, i32 addrspace(1)* %gep
1549 %and = and i32 %val, 255
1550 %cvt = uitofp i32 %and to float
1551 store float %cvt, float addrspace(1)* %out
1555 define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1556 ; SI-LABEL: extract_byte1_to_f32:
1558 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1559 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1560 ; SI-NEXT: s_mov_b32 s7, 0xf000
1561 ; SI-NEXT: s_mov_b32 s2, 0
1562 ; SI-NEXT: s_mov_b32 s3, s7
1563 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1564 ; SI-NEXT: v_mov_b32_e32 v1, 0
1565 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1566 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1567 ; SI-NEXT: s_mov_b32 s6, -1
1568 ; SI-NEXT: s_waitcnt vmcnt(0)
1569 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1570 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1573 ; VI-LABEL: extract_byte1_to_f32:
1575 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1576 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1577 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1578 ; VI-NEXT: s_mov_b32 s7, 0xf000
1579 ; VI-NEXT: s_mov_b32 s6, -1
1580 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1581 ; VI-NEXT: v_mov_b32_e32 v1, s1
1582 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1583 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1584 ; VI-NEXT: flat_load_dword v0, v[0:1]
1585 ; VI-NEXT: s_waitcnt vmcnt(0)
1586 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1587 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1590 ; GFX10-LABEL: extract_byte1_to_f32:
1592 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1593 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1594 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1595 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1596 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1597 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1598 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1599 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
1600 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1601 ; GFX10-NEXT: s_endpgm
1602 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1603 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1604 %val = load i32, i32 addrspace(1)* %gep
1605 %srl = lshr i32 %val, 8
1606 %and = and i32 %srl, 255
1607 %cvt = uitofp i32 %and to float
1608 store float %cvt, float addrspace(1)* %out
1612 define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1613 ; SI-LABEL: extract_byte2_to_f32:
1615 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1616 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1617 ; SI-NEXT: s_mov_b32 s7, 0xf000
1618 ; SI-NEXT: s_mov_b32 s2, 0
1619 ; SI-NEXT: s_mov_b32 s3, s7
1620 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1621 ; SI-NEXT: v_mov_b32_e32 v1, 0
1622 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1623 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1624 ; SI-NEXT: s_mov_b32 s6, -1
1625 ; SI-NEXT: s_waitcnt vmcnt(0)
1626 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
1627 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1630 ; VI-LABEL: extract_byte2_to_f32:
1632 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1633 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1634 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1635 ; VI-NEXT: s_mov_b32 s7, 0xf000
1636 ; VI-NEXT: s_mov_b32 s6, -1
1637 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1638 ; VI-NEXT: v_mov_b32_e32 v1, s1
1639 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1640 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1641 ; VI-NEXT: flat_load_dword v0, v[0:1]
1642 ; VI-NEXT: s_waitcnt vmcnt(0)
1643 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
1644 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1647 ; GFX10-LABEL: extract_byte2_to_f32:
1649 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1650 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1651 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1652 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1653 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1654 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1655 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1656 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
1657 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1658 ; GFX10-NEXT: s_endpgm
1659 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1660 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1661 %val = load i32, i32 addrspace(1)* %gep
1662 %srl = lshr i32 %val, 16
1663 %and = and i32 %srl, 255
1664 %cvt = uitofp i32 %and to float
1665 store float %cvt, float addrspace(1)* %out
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py — regenerate them rather than hand-editing.
; Tests that extracting the top byte of a loaded i32 ((val >> 24) & 255)
; followed by uitofp folds to a single v_cvt_f32_ubyte3 on SI, VI and GFX10,
; as shown by the v_cvt_f32_ubyte3_e32 lines in each prefix's checks.
1669 define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1670 ; SI-LABEL: extract_byte3_to_f32:
1672 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1673 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
1674 ; SI-NEXT: s_mov_b32 s7, 0xf000
1675 ; SI-NEXT: s_mov_b32 s2, 0
1676 ; SI-NEXT: s_mov_b32 s3, s7
1677 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1678 ; SI-NEXT: v_mov_b32_e32 v1, 0
1679 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1680 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1681 ; SI-NEXT: s_mov_b32 s6, -1
1682 ; SI-NEXT: s_waitcnt vmcnt(0)
1683 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1684 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1687 ; VI-LABEL: extract_byte3_to_f32:
1689 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1690 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1691 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1692 ; VI-NEXT: s_mov_b32 s7, 0xf000
1693 ; VI-NEXT: s_mov_b32 s6, -1
1694 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1695 ; VI-NEXT: v_mov_b32_e32 v1, s1
1696 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1697 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1698 ; VI-NEXT: flat_load_dword v0, v[0:1]
1699 ; VI-NEXT: s_waitcnt vmcnt(0)
1700 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1701 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1704 ; GFX10-LABEL: extract_byte3_to_f32:
1706 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1707 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1708 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1709 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1710 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1711 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1712 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1713 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
1714 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1715 ; GFX10-NEXT: s_endpgm
; Per-lane element address: gep indexed by workitem id. The `and` with 255 is
; a no-op after lshr by 24, but the combine must still fold the whole chain.
1716 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1717 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1718 %val = load i32, i32 addrspace(1)* %gep
1719 %srl = lshr i32 %val, 24
1720 %and = and i32 %srl, 255
1721 %cvt = uitofp i32 %and to float
1722 store float %cvt, float addrspace(1)* %out
1726 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
1727 ; SI-LABEL: cvt_ubyte0_or_multiuse:
1728 ; SI: ; %bb.0: ; %bb
1729 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1730 ; SI-NEXT: s_mov_b32 s3, 0xf000
1731 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1732 ; SI-NEXT: v_mov_b32_e32 v1, 0
1733 ; SI-NEXT: s_mov_b32 s2, -1
1734 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1735 ; SI-NEXT: s_mov_b32 s0, s6
1736 ; SI-NEXT: s_mov_b32 s1, s7
1737 ; SI-NEXT: s_mov_b32 s6, 0
1738 ; SI-NEXT: s_mov_b32 s7, s3
1739 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1740 ; SI-NEXT: s_waitcnt vmcnt(0)
1741 ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1742 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
1743 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
1744 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1747 ; VI-LABEL: cvt_ubyte0_or_multiuse:
1748 ; VI: ; %bb.0: ; %bb
1749 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1750 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1751 ; VI-NEXT: s_mov_b32 s3, 0xf000
1752 ; VI-NEXT: s_mov_b32 s2, -1
1753 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1754 ; VI-NEXT: v_mov_b32_e32 v1, s5
1755 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1756 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1757 ; VI-NEXT: flat_load_dword v0, v[0:1]
1758 ; VI-NEXT: s_mov_b32 s0, s6
1759 ; VI-NEXT: s_mov_b32 s1, s7
1760 ; VI-NEXT: s_waitcnt vmcnt(0)
1761 ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1762 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
1763 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
1764 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1767 ; GFX10-LABEL: cvt_ubyte0_or_multiuse:
1768 ; GFX10: ; %bb.0: ; %bb
1769 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1770 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1771 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1772 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1773 ; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
1774 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1775 ; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
1776 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
1777 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
1778 ; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
1779 ; GFX10-NEXT: s_endpgm
1781 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1782 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
1783 %load = load i32, i32 addrspace(1)* %gep
1784 %or = or i32 %load, -2147483647
1785 %and = and i32 %or, 255
1786 %uitofp = uitofp i32 %and to float
1787 %cast = bitcast i32 %or to float
1788 %add = fadd float %cast, %uitofp
1789 store float %add, float addrspace(1)* %out