; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
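
; Multiply two i32 values whose upper 8 bits have been cleared with a shl/lshr pair.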
define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; SI-LABEL: test_umul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xffffff
; SI-NEXT: s_and_b32 s3, s3, 0xffffff
; SI-NEXT: s_mul_i32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_and_b32 s0, s2, 0xffffff
; VI-NEXT: s_and_b32 s1, s3, 0xffffff
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %0 = shl i32 %a, 8
  %a_24 = lshr i32 %0, 8
  %1 = shl i32 %b, 8
  %b_24 = lshr i32 %1, 8
  %2 = mul i32 %a_24, %b_24
  store i32 %2, ptr addrspace(1) %out
  ret void
}
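
; i16 multiply with the result sign-extended to i32.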
define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) {
; SI-LABEL: test_umul24_i16_sext:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: s_mul_i32 s2, s2, s4
; SI-NEXT: s_sext_i32_i16 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_sext:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: s_sext_i32_i16 s4, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_sext:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_mul_i32 s2, s2, s0
; GFX9-NEXT: s_sext_i32_i16 s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %mul = mul i16 %a, %b
  %ext = sext i16 %mul to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}
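
; i16 multiply of two loaded (divergent) values, sign-extended to i32.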
define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: test_umul24_i16_vgpr_sext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: v_mov_b32_e32 v3, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_vgpr_sext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
  %ptr_a = getelementptr i16, ptr addrspace(1) %in, i32 %tid.x
  %ptr_b = getelementptr i16, ptr addrspace(1) %in, i32 %tid.y
  %a = load i16, ptr addrspace(1) %ptr_a
  %b = load i16, ptr addrspace(1) %ptr_b
  %mul = mul i16 %a, %b
  %val = sext i16 %mul to i32
  store i32 %val, ptr addrspace(1) %out
  ret void
}
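
; i16 multiply with the result zero-extended to i32.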
define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) {
; SI-LABEL: test_umul24_i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: s_mul_i32 s2, s2, s4
; SI-NEXT: s_and_b32 s4, s2, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_mul_i32 s2, s2, s0
; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %mul = mul i16 %a, %b
  %ext = zext i16 %mul to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}
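
; i16 multiply of two loaded (divergent) values, zero-extended to i32.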
define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: test_umul24_i16_vgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: v_mov_b32_e32 v3, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
  %ptr_a = getelementptr i16, ptr addrspace(1) %in, i32 %tid.x
  %ptr_b = getelementptr i16, ptr addrspace(1) %in, i32 %tid.y
  %a = load i16, ptr addrspace(1) %ptr_a
  %b = load i16, ptr addrspace(1) %ptr_b
  %mul = mul i16 %a, %b
  %val = zext i16 %mul to i32
  store i32 %val, ptr addrspace(1) %out
  ret void
}
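
; i8 multiply of two loaded values, sign-extended to i32.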
define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SI-LABEL: test_umul24_i8_vgpr:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_mov_b32_e32 v3, v0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_mov_b32_e32 v2, v4
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: buffer_load_ubyte v0, v[3:4], s[12:15], 0 addr64
; SI-NEXT: buffer_load_ubyte v1, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i8_vgpr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i8_vgpr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7]
; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
  %a.ptr = getelementptr i8, ptr addrspace(1) %a, i32 %tid.x
  %b.ptr = getelementptr i8, ptr addrspace(1) %b, i32 %tid.y
  %a.l = load i8, ptr addrspace(1) %a.ptr
  %b.l = load i8, ptr addrspace(1) %b.ptr
  %mul = mul i8 %a.l, %b.l
  %ext = sext i8 %mul to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}
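
; High 32 bits of a 48-bit product of two i32 values masked to 24 bits and zero-extended to i64.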
define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) {
; SI-LABEL: test_umulhi24_i32_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24_i32_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i32_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %a.24 = and i32 %a, 16777215
  %b.24 = and i32 %b, 16777215
  %a.24.i64 = zext i32 %a.24 to i64
  %b.24.i64 = zext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, ptr addrspace(1) %out
  ret void
}
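
; High 32 bits of the product of two i64 values masked to 24 bits.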
define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) {
; SI-LABEL: test_umulhi24:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
  %a.24 = and i64 %a, 16777215
  %b.24 = and i64 %b, 16777215
  %mul48 = mul i64 %a.24, %b.24
  %mul48.hi = lshr i64 %mul48, 32
  %mul24.hi = trunc i64 %mul48.hi to i32
  store i32 %mul24.hi, ptr addrspace(1) %out
  ret void
}
; Multiply with 24-bit inputs and 64-bit output.
define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; SI-LABEL: test_umul24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_and_b32 s4, s6, 0xffffff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s5, s7, 0xffffff
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: s_mul_i32 s4, s4, s5
; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
; VI-NEXT: v_mul_u32_u24_e32 v0, s6, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
  %tmp0 = shl i64 %a, 40
  %a_24 = lshr i64 %tmp0, 40
  %tmp1 = shl i64 %b, 40
  %b_24 = lshr i64 %tmp1, 40
  %tmp2 = mul i64 %a_24, %b_24
  store i64 %tmp2, ptr addrspace(1) %out
  ret void
}
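
; Full 48-bit product of two i64 operands masked to 24 bits.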
define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
; GCN-LABEL: test_umul48_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_u32_u24_e32 v3, v0, v2
; GCN-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v2
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lhs24 = and i64 %lhs, 16777215
  %rhs24 = and i64 %rhs, 16777215
  %mul = mul i64 %lhs24, %rhs24
  ret i64 %mul
}
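
; Vector variant: 48-bit products of <2 x i64> operands masked to 24 bits.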
define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GCN-LABEL: test_umul48_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_u32_u24_e32 v5, v0, v4
; GCN-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v4
; GCN-NEXT: v_mul_u32_u24_e32 v4, v2, v6
; GCN-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v6
; GCN-NEXT: v_mov_b32_e32 v0, v5
; GCN-NEXT: v_mov_b32_e32 v2, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
  %lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
  %rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
  %mul = mul <2 x i64> %lhs24, %rhs24
  ret <2 x i64> %mul
}
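
; Squaring a 24-bit value, producing a 64-bit result.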
define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) {
; SI-LABEL: test_umul24_i64_square:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s5, s4, 0xffffff
; SI-NEXT: s_mul_i32 s5, s5, s5
; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64_square:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %tmp0 = shl i64 %a, 40
  %a.24 = lshr i64 %tmp0, 40
  %tmp2 = mul i64 %a.24, %a.24
  store i64 %tmp2, ptr addrspace(1) %out
  ret void
}
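
; High half of a 16-bit multiply done in i32, stored as i16.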
define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; SI-LABEL: test_umulhi16_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xffff
; SI-NEXT: s_and_b32 s3, s3, 0xffff
; SI-NEXT: s_mul_i32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s2, s2, 16
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi16_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_and_b32 s0, s2, 0xffff
; VI-NEXT: s_and_b32 s1, s3, 0xffff
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_lshr_b32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi16_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
  %a.16 = and i32 %a, 65535
  %b.16 = and i32 %b, 65535
  %mul = mul i32 %a.16, %b.16
  %hi = lshr i32 %mul, 16
  %mulhi = trunc i32 %hi to i16
  store i16 %mulhi, ptr addrspace(1) %out
  ret void
}
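
; i33 multiply of two 24-bit values, zero-extended to i64.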
define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_umul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, s2, 0xffffff
; SI-NEXT: s_and_b32 s3, s0, 0xffffff
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; SI-NEXT: s_mul_i32 s1, s1, s3
; SI-NEXT: v_and_b32_e32 v1, 1, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1
; VI-NEXT: v_and_b32_e32 v1, 1, v1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_i32 s2, s0, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %tmp0 = shl i33 %a, 9
  %a_24 = lshr i33 %tmp0, 9
  %tmp1 = shl i33 %b, 9
  %b_24 = lshr i33 %tmp1, 9
  %tmp2 = mul i33 %a_24, %b_24
  %ext = zext i33 %tmp2 to i64
  store i64 %ext, ptr addrspace(1) %out
  ret void
}
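
; Bit 32 of an i33 product of two 24-bit values.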
define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_umulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
  %tmp0 = shl i33 %a, 9
  %a_24 = lshr i33 %tmp0, 9
  %tmp1 = shl i33 %b, 9
  %b_24 = lshr i33 %tmp1, 9
  %tmp2 = mul i33 %a_24, %b_24
  %hi = lshr i33 %tmp2, 32
  %trunc = trunc i33 %hi to i32
  store i32 %trunc, ptr addrspace(1) %out
  ret void
}

; Make sure the created any_extend is ignored to use the real bits being multiplied.
define i17 @test_umul24_anyextend_i24_src0_src1(i24 %a, i24 %b) {
; GCN-LABEL: test_umul24_anyextend_i24_src0_src1:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
  %aa = mul i24 %a, 234
  %bb = mul i24 %b, 923
  %a_32 = zext i24 %aa to i32
  %b_32 = zext i24 %bb to i32
  %mul = mul i32 %a_32, %b_32
  %trunc = trunc i32 %mul to i17
  %arst = mul i17 %trunc, 99
  ret i17 %arst
}
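
; Same as above with i23 sources, which keep their explicit masks.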
define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
; GCN-LABEL: test_umul24_anyextend_i23_src0_src1:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x7ffffe, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
  %aa = mul i23 %a, 234
  %bb = mul i23 %b, 923
  %a_32 = zext i23 %aa to i32
  %b_32 = zext i23 %bb to i32
  %mul = mul i32 %a_32, %b_32
  %trunc = trunc i32 %mul to i17
  %arst = mul i17 %trunc, 99
  ret i17 %arst
}