1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
; test_umul24_i32: multiplies two i32 values that are each the result of a
; logical shift right by 8 (so bits 31:24 of both operands are zero) and stores
; the i32 product to %out.
; In the expected output below, the operands KC0[2].Z and KC0[2].W are masked
; with 16777215 (0xFFFFFF) and multiplied with MULLO_INT. For the CM prefix the
; MULLO_INT is listed in all four vector slots with the Y/Z/W results masked.
5 define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
6 ; CM-LABEL: test_umul24_i32:
7 ; CM: ; %bb.0: ; %entry
8 ; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
9 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
12 ; CM-NEXT: ALU clause starting at 4:
13 ; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
14 ; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
15 ; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.y,
16 ; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
17 ; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
18 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
19 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
20 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
22 ; EG-LABEL: test_umul24_i32:
23 ; EG: ; %bb.0: ; %entry
24 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
25 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
28 ; EG-NEXT: ALU clause starting at 4:
29 ; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
30 ; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
31 ; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
32 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
33 ; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
34 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Both multiplicands come out of an lshr by 8, leaving at most 24 significant bits.
37 %a_24 = lshr i32 %0, 8
39 %b_24 = lshr i32 %1, 8
40 %2 = mul i32 %a_24, %b_24
41 store i32 %2, ptr addrspace(1) %out
45 ; The result must be sign-extended.
; test_umul24_i16_sext: stores the i16 value %mul, sign-extended to i32, to %out.
; The expected output below reads two 16-bit values (VTX_READ_16 at offsets 40
; and 42), multiplies them with MULLO_INT, and then applies BFE_INT with
; operands 0.0 and the literal 16 to the product before the store.
; NOTE(review): the BFE_INT is presumably what implements the sign extension
; from 16 bits - confirm against the R600 ISA definition of BFE_INT.
46 define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) {
47 ; CM-LABEL: test_umul24_i16_sext:
48 ; CM: ; %bb.0: ; %entry
49 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
51 ; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
52 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
55 ; CM-NEXT: Fetch clause starting at 6:
56 ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
57 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
58 ; CM-NEXT: ALU clause starting at 10:
59 ; CM-NEXT: MOV * T0.X, 0.0,
60 ; CM-NEXT: ALU clause starting at 11:
61 ; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
62 ; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
63 ; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
64 ; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
65 ; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
66 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
67 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
68 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
70 ; EG-LABEL: test_umul24_i16_sext:
71 ; EG: ; %bb.0: ; %entry
72 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
74 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
75 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
78 ; EG-NEXT: Fetch clause starting at 6:
79 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
80 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
81 ; EG-NEXT: ALU clause starting at 10:
82 ; EG-NEXT: MOV * T0.X, 0.0,
83 ; EG-NEXT: ALU clause starting at 11:
84 ; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
85 ; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
86 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
87 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; Widen the 16-bit product with sign extension, then store all 32 bits.
90 %ext = sext i16 %mul to i32
91 store i32 %ext, ptr addrspace(1) %out
95 ; The result must be sign-extended.
; test_umul24_i8: stores the i8 value %mul, sign-extended to i32, to %out.
; The expected output below reads two 8-bit values (VTX_READ_8 at offsets 40
; and 41), multiplies them with MULLO_INT, and then applies BFE_INT with
; operands 0.0 and the literal 8 to the product before the store.
; NOTE(review): the BFE_INT is presumably what implements the sign extension
; from 8 bits - confirm against the R600 ISA definition of BFE_INT.
96 define amdgpu_kernel void @test_umul24_i8(ptr addrspace(1) %out, i8 %a, i8 %b) {
97 ; CM-LABEL: test_umul24_i8:
98 ; CM: ; %bb.0: ; %entry
99 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
101 ; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
102 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
105 ; CM-NEXT: Fetch clause starting at 6:
106 ; CM-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
107 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
108 ; CM-NEXT: ALU clause starting at 10:
109 ; CM-NEXT: MOV * T0.X, 0.0,
110 ; CM-NEXT: ALU clause starting at 11:
111 ; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
112 ; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
113 ; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
114 ; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
115 ; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
116 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
117 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
118 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
120 ; EG-LABEL: test_umul24_i8:
121 ; EG: ; %bb.0: ; %entry
122 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
124 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
125 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
128 ; EG-NEXT: Fetch clause starting at 6:
129 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
130 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
131 ; EG-NEXT: ALU clause starting at 10:
132 ; EG-NEXT: MOV * T0.X, 0.0,
133 ; EG-NEXT: ALU clause starting at 11:
134 ; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
135 ; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
136 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
137 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; Widen the 8-bit product with sign extension, then store all 32 bits.
140 %ext = sext i8 %mul to i32
141 store i32 %ext, ptr addrspace(1) %out
; test_umulhi24_i32_i64: masks both i32 arguments to their low 24 bits,
; zero-extends them to i64, multiplies, and stores bits 63:32 of the product
; (truncated to i32) to %out.
; The expected output below is a MULHI_UINT24 applied directly to KC0[2].Z and
; KC0[2].W, with no separate masking instruction. For the CM prefix the
; MULHI_UINT24 is listed in all four vector slots with the Y/Z/W results masked.
145 define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) {
146 ; CM-LABEL: test_umulhi24_i32_i64:
147 ; CM: ; %bb.0: ; %entry
148 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
149 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
152 ; CM-NEXT: ALU clause starting at 4:
153 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
154 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
155 ; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].Z, KC0[2].W,
156 ; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
157 ; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
158 ; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
160 ; EG-LABEL: test_umulhi24_i32_i64:
161 ; EG: ; %bb.0: ; %entry
162 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
163 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
166 ; EG-NEXT: ALU clause starting at 4:
167 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
168 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
169 ; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].Z, KC0[2].W,
; 16777215 is 0xFFFFFF: keep only the low 24 bits of each argument.
171 %a.24 = and i32 %a, 16777215
172 %b.24 = and i32 %b, 16777215
; Widen to 64 bits so the full product is available.
173 %a.24.i64 = zext i32 %a.24 to i64
174 %b.24.i64 = zext i32 %b.24 to i64
175 %mul48 = mul i64 %a.24.i64, %b.24.i64
; Extract the upper 32 bits of the 64-bit product.
176 %mul48.hi = lshr i64 %mul48, 32
177 %mul24hi = trunc i64 %mul48.hi to i32
178 store i32 %mul24hi, ptr addrspace(1) %out
; test_umulhi24: masks both i64 arguments to their low 24 bits, multiplies them
; as i64, and stores bits 63:32 of the product (truncated to i32) to %out.
; The expected output below is a MULHI_UINT24 applied directly to KC0[2].W and
; KC0[3].Y, with no separate masking instruction. For the CM prefix the
; MULHI_UINT24 is listed in all four vector slots with the Y/Z/W results masked.
182 define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) {
183 ; CM-LABEL: test_umulhi24:
184 ; CM: ; %bb.0: ; %entry
185 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
186 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
189 ; CM-NEXT: ALU clause starting at 4:
190 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
191 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
192 ; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].W, KC0[3].Y,
193 ; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].W, KC0[3].Y,
194 ; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
195 ; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
197 ; EG-LABEL: test_umulhi24:
198 ; EG: ; %bb.0: ; %entry
199 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
200 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
203 ; EG-NEXT: ALU clause starting at 4:
204 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
205 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
206 ; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].W, KC0[3].Y,
; 16777215 is 0xFFFFFF: keep only the low 24 bits of each 64-bit argument.
208 %a.24 = and i64 %a, 16777215
209 %b.24 = and i64 %b, 16777215
210 %mul48 = mul i64 %a.24, %b.24
; Extract the upper 32 bits of the 64-bit product.
211 %mul48.hi = lshr i64 %mul48, 32
212 %mul24.hi = trunc i64 %mul48.hi to i32
213 store i32 %mul24.hi, ptr addrspace(1) %out
217 ; Multiply with 24-bit inputs and 64-bit output.
; test_umul24_i64: clears the upper 40 bits of each i64 argument (shl by 40
; followed by lshr by 40), multiplies the resulting 24-bit values as i64, and
; stores the full i64 product to %out.
; In the expected output below, the X component of the stored value comes from
; AND_INT with 16777215 followed by MULLO_INT, and the Y component comes from
; MULHI_UINT24 on KC0[2].W and KC0[3].Y. For the CM prefix each multiply is
; listed in all four vector slots with the unused results masked.
218 define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
219 ; CM-LABEL: test_umul24_i64:
220 ; CM: ; %bb.0: ; %entry
221 ; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
222 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
225 ; CM-NEXT: ALU clause starting at 4:
226 ; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
227 ; CM-NEXT: AND_INT * T0.Z, KC0[3].Y, literal.y,
228 ; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
229 ; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
230 ; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
231 ; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
232 ; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
233 ; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
234 ; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
235 ; CM-NEXT: MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
236 ; CM-NEXT: MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
237 ; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
238 ; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
240 ; EG-LABEL: test_umul24_i64:
241 ; EG: ; %bb.0: ; %entry
242 ; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
243 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
246 ; EG-NEXT: ALU clause starting at 4:
247 ; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
248 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
249 ; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
250 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
251 ; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
252 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
253 ; EG-NEXT: MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
; shl 40 then lshr 40 on an i64 zeroes bits 63:24, leaving the low 24 bits of %a.
255 %tmp0 = shl i64 %a, 40
256 %a_24 = lshr i64 %tmp0, 40
; Same 24-bit truncation for %b.
257 %tmp1 = shl i64 %b, 40
258 %b_24 = lshr i64 %tmp1, 40
259 %tmp2 = mul i64 %a_24, %b_24
260 store i64 %tmp2, ptr addrspace(1) %out