1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
8 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; 32-bit multiply of two i32 kernel arguments, each first sign-extended from
; its low 24 bits (shl by 8 followed by ashr by 8). The 32-bit product is
; stored to %out.
9 define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
10 ; SI-LABEL: test_smul24_i32:
11 ; SI: ; %bb.0: ; %entry
12 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
14 ; SI-NEXT: s_mov_b32 s3, 0xf000
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
17 ; SI-NEXT: s_bfe_i32 s4, s5, 0x180000
18 ; SI-NEXT: s_mul_i32 s4, s2, s4
19 ; SI-NEXT: s_mov_b32 s2, -1
20 ; SI-NEXT: v_mov_b32_e32 v0, s4
21 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
24 ; VI-LABEL: test_smul24_i32:
25 ; VI: ; %bb.0: ; %entry
26 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
28 ; VI-NEXT: s_mov_b32 s7, 0xf000
29 ; VI-NEXT: s_mov_b32 s6, -1
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
32 ; VI-NEXT: s_bfe_i32 s1, s1, 0x180000
33 ; VI-NEXT: s_mul_i32 s0, s0, s1
34 ; VI-NEXT: v_mov_b32_e32 v0, s0
35 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
38 ; GFX9-LABEL: test_smul24_i32:
39 ; GFX9: ; %bb.0: ; %entry
40 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
41 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
42 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
43 ; GFX9-NEXT: s_mov_b32 s6, -1
44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
45 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
46 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
47 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
48 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
49 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
52 ; EG-LABEL: test_smul24_i32:
53 ; EG: ; %bb.0: ; %entry
54 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
55 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
58 ; EG-NEXT: ALU clause starting at 4:
59 ; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
60 ; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
61 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
62 ; EG-NEXT: ASHR T1.W, PS, literal.x,
63 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
64 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
65 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
66 ; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
67 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
69 ; CM-LABEL: test_smul24_i32:
70 ; CM: ; %bb.0: ; %entry
71 ; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
72 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
75 ; CM-NEXT: ALU clause starting at 4:
76 ; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
77 ; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
78 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
79 ; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
80 ; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
81 ; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
82 ; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
83 ; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
84 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
85 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
86 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; shl 8 then ashr 8 on an i32 replicates bit 23 into bits 31..24, i.e. it
; sign-extends the low 24 bits of each operand.
88 %a.shl = shl i32 %a, 8
89 %a.24 = ashr i32 %a.shl, 8
90 %b.shl = shl i32 %b, 8
91 %b.24 = ashr i32 %b.shl, 8
92 %mul24 = mul i32 %a.24, %b.24
93 store i32 %mul24, i32 addrspace(1)* %out
; High half of a widened product: both i32 arguments are sign-extended from
; their low 24 bits, widened to i64 and multiplied. Bits 63..32 of the i64
; product (lshr by 32, then trunc to i32) are stored to %out.
97 define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
98 ; SI-LABEL: test_smulhi24_i64:
99 ; SI: ; %bb.0: ; %entry
100 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
101 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
102 ; SI-NEXT: s_mov_b32 s3, 0xf000
103 ; SI-NEXT: s_mov_b32 s2, -1
104 ; SI-NEXT: s_waitcnt lgkmcnt(0)
105 ; SI-NEXT: v_mov_b32_e32 v0, s5
106 ; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
107 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
110 ; VI-LABEL: test_smulhi24_i64:
111 ; VI: ; %bb.0: ; %entry
112 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
113 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
114 ; VI-NEXT: s_mov_b32 s7, 0xf000
115 ; VI-NEXT: s_mov_b32 s6, -1
116 ; VI-NEXT: s_waitcnt lgkmcnt(0)
117 ; VI-NEXT: v_mov_b32_e32 v0, s1
118 ; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s0, v0
119 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
122 ; GFX9-LABEL: test_smulhi24_i64:
123 ; GFX9: ; %bb.0: ; %entry
124 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
125 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
126 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
127 ; GFX9-NEXT: s_mov_b32 s6, -1
128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
130 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
131 ; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
132 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
133 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
134 ; GFX9-NEXT: s_endpgm
136 ; EG-LABEL: test_smulhi24_i64:
137 ; EG: ; %bb.0: ; %entry
138 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
139 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
142 ; EG-NEXT: ALU clause starting at 4:
143 ; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
144 ; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
145 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
146 ; EG-NEXT: ASHR T1.W, PS, literal.x,
147 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
148 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
149 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
150 ; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
151 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
153 ; CM-LABEL: test_smulhi24_i64:
154 ; CM: ; %bb.0: ; %entry
155 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
156 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
159 ; CM-NEXT: ALU clause starting at 4:
160 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
161 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
162 ; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
163 ; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
164 ; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
165 ; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
; Sign-extend the low 24 bits of each operand (shl 8, then ashr 8).
167 %a.shl = shl i32 %a, 8
168 %a.24 = ashr i32 %a.shl, 8
169 %b.shl = shl i32 %b, 8
170 %b.24 = ashr i32 %b.shl, 8
; Widen to i64 so the multiply produces the full product, then keep only the
; upper 32 bits.
171 %a.24.i64 = sext i32 %a.24 to i64
172 %b.24.i64 = sext i32 %b.24 to i64
173 %mul48 = mul i64 %a.24.i64, %b.24.i64
174 %mul48.hi = lshr i64 %mul48, 32
175 %mul24hi = trunc i64 %mul48.hi to i32
176 store i32 %mul24hi, i32 addrspace(1)* %out
180 ; This requires handling the original 64-bit mul node directly in order to
181 ; eliminate the unnecessary extension instructions: after legalization,
182 ; SimplifyDemandedBits will not remove them, because they have multiple
183 ; uses (by the separate mul and mulhi).
; Full 64-bit product of two i32 arguments, each sign-extended from its low
; 24 bits and then widened to i64. The i64 result is stored to %out.
; The unnamed [8 x i32] parameters precede %a and %b in the signature
; (presumably to space the two operands apart in the kernel argument
; layout -- TODO confirm).
184 define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
185 ; SI-LABEL: test_smul24_i64:
187 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
188 ; SI-NEXT: s_load_dword s2, s[0:1], 0x13
189 ; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
190 ; SI-NEXT: s_mov_b32 s7, 0xf000
191 ; SI-NEXT: s_mov_b32 s6, -1
192 ; SI-NEXT: s_waitcnt lgkmcnt(0)
193 ; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
194 ; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
195 ; SI-NEXT: v_mov_b32_e32 v0, s1
196 ; SI-NEXT: s_mul_i32 s1, s0, s1
197 ; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
198 ; SI-NEXT: v_mov_b32_e32 v0, s1
199 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
202 ; VI-LABEL: test_smul24_i64:
204 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
205 ; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
206 ; VI-NEXT: s_load_dword s0, s[0:1], 0x70
207 ; VI-NEXT: s_mov_b32 s7, 0xf000
208 ; VI-NEXT: s_mov_b32 s6, -1
209 ; VI-NEXT: s_waitcnt lgkmcnt(0)
210 ; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
211 ; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
212 ; VI-NEXT: v_mov_b32_e32 v0, s1
213 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
214 ; VI-NEXT: s_mul_i32 s0, s0, s1
215 ; VI-NEXT: v_mov_b32_e32 v0, s0
216 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
219 ; GFX9-LABEL: test_smul24_i64:
221 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
222 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
223 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
224 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
225 ; GFX9-NEXT: s_mov_b32 s6, -1
226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
228 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
229 ; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
230 ; GFX9-NEXT: s_mul_i32 s1, s1, s0
231 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
232 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
233 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
234 ; GFX9-NEXT: s_endpgm
236 ; EG-LABEL: test_smul24_i64:
238 ; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
239 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
242 ; EG-NEXT: ALU clause starting at 4:
243 ; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
244 ; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
245 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
246 ; EG-NEXT: ASHR T1.W, PS, literal.x,
247 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
248 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
249 ; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
250 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
251 ; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
252 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
254 ; CM-LABEL: test_smul24_i64:
256 ; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
257 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
260 ; CM-NEXT: ALU clause starting at 4:
261 ; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
262 ; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
263 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
264 ; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
265 ; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
266 ; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
267 ; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
268 ; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
269 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
270 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
271 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
272 ; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
273 ; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
274 ; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
275 ; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
; Each operand: sign-extend from 24 bits within i32 (shl 8 / ashr 8), then
; sext to i64 before the 64-bit multiply.
276 %shl.i = shl i32 %a, 8
277 %shr.i = ashr i32 %shl.i, 8
278 %conv.i = sext i32 %shr.i to i64
279 %shl1.i = shl i32 %b, 8
280 %shr2.i = ashr i32 %shl1.i, 8
281 %conv3.i = sext i32 %shr2.i to i64
282 %mul.i = mul i64 %conv3.i, %conv.i
283 store i64 %mul.i, i64 addrspace(1)* %out
; 64-bit square: %a is sign-extended from its low 24 bits, widened to i64 and
; multiplied by itself, so both operands of the mul are the same value. The
; i64 result is stored to %out. %b is not referenced by the instructions
; shown below.
287 define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
288 ; SI-LABEL: test_smul24_i64_square:
290 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
291 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
292 ; SI-NEXT: s_mov_b32 s3, 0xf000
293 ; SI-NEXT: s_mov_b32 s2, -1
294 ; SI-NEXT: s_waitcnt lgkmcnt(0)
295 ; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
296 ; SI-NEXT: s_mul_i32 s5, s4, s4
297 ; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
298 ; SI-NEXT: v_mov_b32_e32 v0, s5
299 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
302 ; VI-LABEL: test_smul24_i64_square:
304 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
305 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
306 ; VI-NEXT: s_mov_b32 s7, 0xf000
307 ; VI-NEXT: s_mov_b32 s6, -1
308 ; VI-NEXT: s_waitcnt lgkmcnt(0)
309 ; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
310 ; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
311 ; VI-NEXT: s_mul_i32 s0, s0, s0
312 ; VI-NEXT: v_mov_b32_e32 v0, s0
313 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
316 ; GFX9-LABEL: test_smul24_i64_square:
318 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
319 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
320 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
321 ; GFX9-NEXT: s_mov_b32 s6, -1
322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
324 ; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
325 ; GFX9-NEXT: s_mul_i32 s0, s0, s0
326 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
327 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
328 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
329 ; GFX9-NEXT: s_endpgm
331 ; EG-LABEL: test_smul24_i64_square:
333 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
334 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
337 ; EG-NEXT: ALU clause starting at 4:
338 ; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
339 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
340 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
341 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
342 ; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
343 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
344 ; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
345 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
347 ; CM-LABEL: test_smul24_i64_square:
349 ; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
350 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
353 ; CM-NEXT: ALU clause starting at 4:
354 ; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
355 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
356 ; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
357 ; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
358 ; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
359 ; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
360 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
361 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
362 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
363 ; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
364 ; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
365 ; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
366 ; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
; Sign-extend the low 24 bits of %a, widen to i64, and use the same value for
; both multiply operands.
367 %shl.i = shl i32 %a, 8
368 %shr.i = ashr i32 %shl.i, 8
369 %conv.i = sext i32 %shr.i to i64
370 %mul.i = mul i64 %conv.i, %conv.i
371 store i64 %mul.i, i64 addrspace(1)* %out
; Multiply on a non-power-of-two integer type: both i33 arguments are
; sign-extended from their low 24 bits (shl by 9 followed by ashr by 9 on a
; 33-bit value), multiplied as i33, and the i33 product is sign-extended to
; i64 before being stored to %out.
375 define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
376 ; SI-LABEL: test_smul24_i33:
377 ; SI: ; %bb.0: ; %entry
378 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
379 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
380 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
381 ; SI-NEXT: s_mov_b32 s7, 0xf000
382 ; SI-NEXT: s_mov_b32 s6, -1
383 ; SI-NEXT: s_waitcnt lgkmcnt(0)
384 ; SI-NEXT: s_lshl_b32 s1, s2, 8
385 ; SI-NEXT: s_lshl_b32 s3, s0, 8
386 ; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
387 ; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
388 ; SI-NEXT: v_mov_b32_e32 v0, s2
389 ; SI-NEXT: s_mul_i32 s1, s0, s2
390 ; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
391 ; SI-NEXT: v_mov_b32_e32 v0, s1
392 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
393 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
394 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
397 ; VI-LABEL: test_smul24_i33:
398 ; VI: ; %bb.0: ; %entry
399 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
400 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
401 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
402 ; VI-NEXT: s_mov_b32 s7, 0xf000
403 ; VI-NEXT: s_mov_b32 s6, -1
404 ; VI-NEXT: s_waitcnt lgkmcnt(0)
405 ; VI-NEXT: s_lshl_b32 s1, s2, 8
406 ; VI-NEXT: s_lshl_b32 s3, s0, 8
407 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
408 ; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
409 ; VI-NEXT: v_mov_b32_e32 v0, s2
410 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
411 ; VI-NEXT: s_mul_i32 s0, s0, s2
412 ; VI-NEXT: v_mov_b32_e32 v0, s0
413 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
414 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
415 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
418 ; GFX9-LABEL: test_smul24_i33:
419 ; GFX9: ; %bb.0: ; %entry
420 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
421 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
422 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
423 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
424 ; GFX9-NEXT: s_mov_b32 s6, -1
425 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX9-NEXT: s_lshl_b32 s1, s2, 8
427 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
428 ; GFX9-NEXT: s_lshl_b32 s1, s3, 8
429 ; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
430 ; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
431 ; GFX9-NEXT: s_mul_i32 s0, s0, s2
432 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
433 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
434 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
435 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
436 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
437 ; GFX9-NEXT: s_endpgm
439 ; EG-LABEL: test_smul24_i33:
440 ; EG: ; %bb.0: ; %entry
441 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
442 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
445 ; EG-NEXT: ALU clause starting at 4:
446 ; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
447 ; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
448 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
449 ; EG-NEXT: ASHR T1.W, PS, literal.x,
450 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
451 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
452 ; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
453 ; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
454 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
455 ; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
456 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
458 ; CM-LABEL: test_smul24_i33:
459 ; CM: ; %bb.0: ; %entry
460 ; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
461 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
464 ; CM-NEXT: ALU clause starting at 4:
465 ; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
466 ; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
467 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
468 ; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
469 ; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
470 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
471 ; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
472 ; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
473 ; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
474 ; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
475 ; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
476 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
477 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
478 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
479 ; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
480 ; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
481 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; On i33 the shift amount is 9 (33 - 24), so shl/ashr again leaves a value
; sign-extended from its low 24 bits.
483 %a.shl = shl i33 %a, 9
484 %a.24 = ashr i33 %a.shl, 9
485 %b.shl = shl i33 %b, 9
486 %b.24 = ashr i33 %b.shl, 9
487 %mul24 = mul i33 %a.24, %b.24
488 %ext = sext i33 %mul24 to i64
489 store i64 %ext, i64 addrspace(1)* %out
; Top bit of an i33 product: both i33 arguments are sign-extended from their
; low 24 bits (shl/ashr by 9) and multiplied as i33. The product is then
; logically shifted right by 32, which leaves only bit 32 of the product in
; bit 0; that value is truncated to i32 and stored to %out.
493 define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
494 ; SI-LABEL: test_smulhi24_i33:
495 ; SI: ; %bb.0: ; %entry
496 ; SI-NEXT: s_load_dword s4, s[0:1], 0xd
497 ; SI-NEXT: s_load_dword s5, s[0:1], 0xb
498 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
499 ; SI-NEXT: s_mov_b32 s3, 0xf000
500 ; SI-NEXT: s_mov_b32 s2, -1
501 ; SI-NEXT: s_waitcnt lgkmcnt(0)
502 ; SI-NEXT: v_mov_b32_e32 v0, s4
503 ; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
504 ; SI-NEXT: v_and_b32_e32 v0, 1, v0
505 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
508 ; VI-LABEL: test_smulhi24_i33:
509 ; VI: ; %bb.0: ; %entry
510 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
511 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
512 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
513 ; VI-NEXT: s_mov_b32 s7, 0xf000
514 ; VI-NEXT: s_mov_b32 s6, -1
515 ; VI-NEXT: s_waitcnt lgkmcnt(0)
516 ; VI-NEXT: v_mov_b32_e32 v0, s0
517 ; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
518 ; VI-NEXT: v_and_b32_e32 v0, 1, v0
519 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
522 ; GFX9-LABEL: test_smulhi24_i33:
523 ; GFX9: ; %bb.0: ; %entry
524 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
525 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
526 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
527 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
528 ; GFX9-NEXT: s_mov_b32 s6, -1
529 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX9-NEXT: s_lshl_b32 s1, s2, 8
531 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
532 ; GFX9-NEXT: s_lshl_b32 s1, s3, 8
533 ; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
534 ; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
535 ; GFX9-NEXT: s_and_b32 s0, s0, 1
536 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
537 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
538 ; GFX9-NEXT: s_endpgm
540 ; EG-LABEL: test_smulhi24_i33:
541 ; EG: ; %bb.0: ; %entry
542 ; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
543 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
546 ; EG-NEXT: ALU clause starting at 4:
547 ; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
548 ; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
549 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
550 ; EG-NEXT: ASHR T1.W, PS, literal.x,
551 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
552 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
553 ; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
554 ; EG-NEXT: AND_INT T0.X, PS, 1,
555 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
556 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
558 ; CM-LABEL: test_smulhi24_i33:
559 ; CM: ; %bb.0: ; %entry
560 ; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
561 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
564 ; CM-NEXT: ALU clause starting at 4:
565 ; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
566 ; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
567 ; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
568 ; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
569 ; CM-NEXT: AND_INT * T0.X, PV.X, 1,
570 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
571 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Sign-extend the low 24 bits of each i33 operand (shift amount 9 = 33 - 24).
573 %tmp0 = shl i33 %a, 9
574 %a_24 = ashr i33 %tmp0, 9
575 %tmp1 = shl i33 %b, 9
576 %b_24 = ashr i33 %tmp1, 9
577 %tmp2 = mul i33 %a_24, %b_24
; lshr by 32 on an i33 keeps just the most significant bit of the product.
578 %hi = lshr i33 %tmp2, 32
579 %trunc = trunc i33 %hi to i32
581 store i32 %trunc, i32 addrspace(1)* %out
; Vector variant with control flow. The kernel compares %arg0 against 0 and
; branches to %bb11 when equal, otherwise to %bb7. The instructions shown
; after the branch splat element 0 of %arg1 and of %arg2 across both lanes
; (shufflevector with a zeroinitializer mask), sign-extend every lane from
; its low 24 bits (shl/ashr by 8), multiply the two <2 x i32> values and
; store the result to %out.
; NOTE(review): the name suggests this is a regression test for a compiler
; crash; the original failure is not described in this file -- confirm
; against the commit history.
585 define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
586 ; SI-LABEL: simplify_i24_crash:
588 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
589 ; SI-NEXT: s_waitcnt lgkmcnt(0)
590 ; SI-NEXT: s_cmp_lg_u32 s2, 0
591 ; SI-NEXT: s_cbranch_scc0 BB6_2
592 ; SI-NEXT: ; %bb.1: ; %bb7
594 ; SI-NEXT: BB6_2: ; %bb11
595 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
596 ; SI-NEXT: s_load_dword s4, s[0:1], 0xf
597 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
598 ; SI-NEXT: s_mov_b32 s3, 0xf000
599 ; SI-NEXT: s_waitcnt lgkmcnt(0)
600 ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
601 ; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
602 ; SI-NEXT: s_mul_i32 s4, s2, s4
603 ; SI-NEXT: s_mov_b32 s2, -1
604 ; SI-NEXT: v_mov_b32_e32 v0, s4
605 ; SI-NEXT: v_mov_b32_e32 v1, s4
606 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
609 ; VI-LABEL: simplify_i24_crash:
611 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
612 ; VI-NEXT: s_waitcnt lgkmcnt(0)
613 ; VI-NEXT: s_cmp_lg_u32 s2, 0
614 ; VI-NEXT: s_cbranch_scc0 BB6_2
615 ; VI-NEXT: ; %bb.1: ; %bb7
617 ; VI-NEXT: BB6_2: ; %bb11
618 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
619 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34
620 ; VI-NEXT: s_load_dword s0, s[0:1], 0x3c
621 ; VI-NEXT: s_mov_b32 s7, 0xf000
622 ; VI-NEXT: s_mov_b32 s6, -1
623 ; VI-NEXT: s_waitcnt lgkmcnt(0)
624 ; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
625 ; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
626 ; VI-NEXT: s_mul_i32 s1, s1, s0
627 ; VI-NEXT: v_mov_b32_e32 v0, s1
628 ; VI-NEXT: v_mov_b32_e32 v1, s1
629 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
632 ; GFX9-LABEL: simplify_i24_crash:
633 ; GFX9: ; %bb.0: ; %bb
634 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
635 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0
637 ; GFX9-NEXT: s_cbranch_scc0 BB6_2
638 ; GFX9-NEXT: ; %bb.1: ; %bb7
639 ; GFX9-NEXT: s_endpgm
640 ; GFX9-NEXT: BB6_2: ; %bb11
641 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
642 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
643 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
644 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
645 ; GFX9-NEXT: s_mov_b32 s6, -1
646 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
648 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
649 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
650 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
651 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
652 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
653 ; GFX9-NEXT: s_endpgm
655 ; EG-LABEL: simplify_i24_crash:
657 ; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
658 ; EG-NEXT: JUMP @5 POP:1
659 ; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
660 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
661 ; EG-NEXT: POP @5 POP:1
663 ; EG-NEXT: ALU clause starting at 6:
664 ; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
665 ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
666 ; EG-NEXT: ALU clause starting at 8:
667 ; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
668 ; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
669 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
670 ; EG-NEXT: ASHR T1.W, PS, literal.x,
671 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
672 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
673 ; EG-NEXT: MOV T2.W, KC0[2].Y,
674 ; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
675 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
676 ; EG-NEXT: MOV * T0.Y, PS,
677 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
679 ; CM-LABEL: simplify_i24_crash:
681 ; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
682 ; CM-NEXT: JUMP @5 POP:1
683 ; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
684 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
685 ; CM-NEXT: POP @5 POP:1
687 ; CM-NEXT: ALU clause starting at 6:
688 ; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
689 ; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
690 ; CM-NEXT: ALU clause starting at 8:
691 ; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
692 ; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
693 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
694 ; CM-NEXT: MOV T0.Y, KC0[2].Y,
695 ; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
696 ; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
697 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
698 ; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
699 ; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
700 ; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
701 ; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
702 ; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
703 ; CM-NEXT: MOV * T0.Y, PV.X,
704 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Take the %bb11 path when %arg0 is zero, otherwise %bb7.
706 %cmp = icmp eq i32 %arg0, 0
707 br i1 %cmp, label %bb11, label %bb7
; Broadcast element 0 of each vector argument to both lanes, then perform the
; per-lane 24-bit sign-extension and multiply.
710 %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
711 %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
712 %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
713 %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
714 %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
715 %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
716 %tmp21 = mul <2 x i32> %tmp18, %tmp20
717 store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
; Attribute group #0: applied to the kernel definitions above whose signature
; ends in "#0"; it marks them nounwind.
724 attributes #0 = { nounwind }