1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; Unsigned dot product of two packed <8 x i4> vectors, accumulated into an i32
; that is loaded from and stored back to %dst. Each work-item selects its
; vector pair with workitem.id.x. The assertions below expect the gfx906,
; gfx1011 and gfx1012 runs to select a single v_dot8_u32_u4, while the gfx700,
; gfx803 and gfx900 runs expand into per-nibble extracts followed by 24-bit
; multiplies and adds.
9 define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
10 ; GFX7-LABEL: udot8_acc32:
11 ; GFX7: ; %bb.0: ; %entry
12 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
13 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
15 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
16 ; GFX7-NEXT: s_mov_b32 s14, -1
17 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
18 ; GFX7-NEXT: s_add_u32 s12, s12, s3
19 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
20 ; GFX7-NEXT: s_mov_b32 s10, 0
21 ; GFX7-NEXT: s_mov_b32 s11, s3
22 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
24 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
25 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
26 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
27 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
28 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
29 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
30 ; GFX7-NEXT: s_mov_b32 s2, -1
31 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
32 ; GFX7-NEXT: s_waitcnt vmcnt(1)
33 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
34 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
35 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
36 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
37 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
38 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
39 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
40 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
41 ; GFX7-NEXT: s_waitcnt vmcnt(0)
42 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
43 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
44 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
45 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
46 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
47 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
48 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
49 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
51 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
52 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
53 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
54 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
55 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
56 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
57 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
58 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
59 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
62 ; GFX8-LABEL: udot8_acc32:
63 ; GFX8: ; %bb.0: ; %entry
64 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
65 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
66 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
67 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
68 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
71 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
72 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
73 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
74 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
75 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
76 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
77 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
78 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
79 ; GFX8-NEXT: s_mov_b32 s10, -1
80 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
81 ; GFX8-NEXT: s_add_u32 s8, s8, s3
82 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
83 ; GFX8-NEXT: s_waitcnt vmcnt(1)
84 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
85 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
86 ; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
87 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
88 ; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
89 ; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
90 ; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
91 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
92 ; GFX8-NEXT: s_waitcnt vmcnt(0)
93 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
94 ; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
95 ; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
96 ; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
97 ; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4
98 ; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4
99 ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
100 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
101 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
103 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
104 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
105 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
106 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
107 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
108 ; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
109 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
110 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
111 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
112 ; GFX8-NEXT: flat_store_dword v[0:1], v2
113 ; GFX8-NEXT: s_endpgm
115 ; GFX9-LABEL: udot8_acc32:
116 ; GFX9: ; %bb.0: ; %entry
117 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
118 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
119 ; GFX9-NEXT: s_mov_b32 s10, -1
120 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
121 ; GFX9-NEXT: s_add_u32 s8, s8, s3
122 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
123 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
124 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
125 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
126 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
128 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
129 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
130 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
131 ; GFX9-NEXT: s_waitcnt vmcnt(1)
132 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
133 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
134 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
135 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
136 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
137 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
138 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
139 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
140 ; GFX9-NEXT: s_waitcnt vmcnt(0)
141 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
142 ; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
143 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
144 ; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
145 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
146 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
147 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
148 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
149 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
150 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
151 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
152 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
153 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
155 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
156 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
157 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
158 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
159 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
160 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
161 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
162 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
163 ; GFX9-NEXT: s_endpgm
165 ; GFX9-DL-LABEL: udot8_acc32:
166 ; GFX9-DL: ; %bb.0: ; %entry
167 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
168 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
169 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
170 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
171 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
172 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
173 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
174 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
175 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
176 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
177 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
178 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
179 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
180 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
181 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
182 ; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
183 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
184 ; GFX9-DL-NEXT: s_endpgm
186 ; GFX10-DL-LABEL: udot8_acc32:
187 ; GFX10-DL: ; %bb.0: ; %entry
188 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
189 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
190 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
191 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
192 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
193 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
194 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
195 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
196 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
197 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX10-DL-NEXT: s_clause 0x1
199 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
200 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
201 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
202 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
203 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
204 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
205 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
206 ; GFX10-DL-NEXT: s_endpgm
207 <8 x i4> addrspace(1)* %src2,
208 i32 addrspace(1)* nocapture %dst) {
; Per-work-item addressing: index both sources by workitem.id.x and load one
; packed <8 x i4> vector from each.
210 %idx = call i32 @llvm.amdgcn.workitem.id.x()
211 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
212 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
213 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
214 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Lanes 0-7: extract the matching i4 elements, zero-extend them to i32 and
; multiply. Both operands are at most 15 after the zext, so each product is at
; most 225 and cannot wrap, consistent with the nuw nsw flags.
216 %v1e0 = extractelement <8 x i4> %vec1, i64 0
217 %cv1e0 = zext i4 %v1e0 to i32
218 %v2e0 = extractelement <8 x i4> %vec2, i64 0
219 %cv2e0 = zext i4 %v2e0 to i32
220 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
222 %v1e1 = extractelement <8 x i4> %vec1, i64 1
223 %cv1e1 = zext i4 %v1e1 to i32
224 %v2e1 = extractelement <8 x i4> %vec2, i64 1
225 %cv2e1 = zext i4 %v2e1 to i32
226 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
228 %v1e2 = extractelement <8 x i4> %vec1, i64 2
229 %cv1e2 = zext i4 %v1e2 to i32
230 %v2e2 = extractelement <8 x i4> %vec2, i64 2
231 %cv2e2 = zext i4 %v2e2 to i32
232 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
234 %v1e3 = extractelement <8 x i4> %vec1, i64 3
235 %cv1e3 = zext i4 %v1e3 to i32
236 %v2e3 = extractelement <8 x i4> %vec2, i64 3
237 %cv2e3 = zext i4 %v2e3 to i32
238 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
240 %v1e4 = extractelement <8 x i4> %vec1, i64 4
241 %cv1e4 = zext i4 %v1e4 to i32
242 %v2e4 = extractelement <8 x i4> %vec2, i64 4
243 %cv2e4 = zext i4 %v2e4 to i32
244 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
246 %v1e5 = extractelement <8 x i4> %vec1, i64 5
247 %cv1e5 = zext i4 %v1e5 to i32
248 %v2e5 = extractelement <8 x i4> %vec2, i64 5
249 %cv2e5 = zext i4 %v2e5 to i32
250 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
252 %v1e6 = extractelement <8 x i4> %vec1, i64 6
253 %cv1e6 = zext i4 %v1e6 to i32
254 %v2e6 = extractelement <8 x i4> %vec2, i64 6
255 %cv2e6 = zext i4 %v2e6 to i32
256 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
258 %v1e7 = extractelement <8 x i4> %vec1, i64 7
259 %cv1e7 = zext i4 %v1e7 to i32
260 %v2e7 = extractelement <8 x i4> %vec2, i64 7
261 %cv2e7 = zext i4 %v2e7 to i32
262 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Load the running accumulator from %dst, then add the eight products to it
; one at a time in lane order.
264 %acc = load i32, i32 addrspace(1)* %dst, align 4
265 %add1 = add i32 %mul0, %acc
266 %add2 = add i32 %add1, %mul1
267 %add3 = add i32 %add2, %mul2
268 %add4 = add i32 %add3, %mul3
269 %add5 = add i32 %add4, %mul4
270 %add6 = add i32 %add5, %mul5
271 %add7 = add i32 %add6, %mul6
272 %add8 = add i32 %add7, %mul7
; Write the final sum back over the accumulator in %dst.
274 store i32 %add8, i32 addrspace(1)* %dst, align 4
278 ; TODO: Remove the unnecessary instruction (the one that zero-extends the
279 ; 2nd MAD) so that the pattern recognizer can kick in.
; Same unsigned <8 x i4> dot product, but every multiply and add is done in
; i16 and the accumulator in %dst is an i16. The assertions below expect no
; dot instruction on any run: each one extracts the nibbles individually and
; chains eight multiply-adds (v_mad_u32_u24 on gfx700, v_mad_u16 on gfx803,
; gfx1011 and gfx1012, v_mad_legacy_u16 on gfx900 and gfx906).
280 define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
281 ; GFX7-LABEL: udot8_acc16:
282 ; GFX7: ; %bb.0: ; %entry
283 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
284 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
285 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
286 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
287 ; GFX7-NEXT: s_mov_b32 s14, -1
288 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
289 ; GFX7-NEXT: s_add_u32 s12, s12, s3
290 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
291 ; GFX7-NEXT: s_mov_b32 s10, 0
292 ; GFX7-NEXT: s_mov_b32 s11, s3
293 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
295 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
296 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
297 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
298 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
299 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
300 ; GFX7-NEXT: s_mov_b32 s2, -1
301 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
302 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
303 ; GFX7-NEXT: s_waitcnt vmcnt(2)
304 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
305 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
306 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
307 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
308 ; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
309 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
310 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
311 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
312 ; GFX7-NEXT: s_waitcnt vmcnt(1)
313 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
314 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
315 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
316 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
317 ; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
318 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
319 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
320 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
321 ; GFX7-NEXT: s_waitcnt vmcnt(0)
322 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
323 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
324 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
325 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
326 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
327 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
328 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
329 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
330 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
331 ; GFX7-NEXT: s_endpgm
333 ; GFX8-LABEL: udot8_acc16:
334 ; GFX8: ; %bb.0: ; %entry
335 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
336 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
337 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
338 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
339 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
340 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
342 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
343 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
344 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
345 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
346 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
347 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
348 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
349 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
350 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
351 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
352 ; GFX8-NEXT: s_mov_b32 s10, -1
353 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
354 ; GFX8-NEXT: s_add_u32 s8, s8, s3
355 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
356 ; GFX8-NEXT: s_waitcnt vmcnt(2)
357 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
358 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
359 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
360 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
361 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
362 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
363 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
364 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
365 ; GFX8-NEXT: s_waitcnt vmcnt(1)
366 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
367 ; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
368 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
369 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
370 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
371 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
372 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
373 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
374 ; GFX8-NEXT: s_waitcnt vmcnt(0)
375 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
376 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
377 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
378 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
379 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
380 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
381 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
382 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
383 ; GFX8-NEXT: flat_store_short v[0:1], v2
384 ; GFX8-NEXT: s_endpgm
386 ; GFX9-LABEL: udot8_acc16:
387 ; GFX9: ; %bb.0: ; %entry
388 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
389 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
390 ; GFX9-NEXT: s_mov_b32 s10, -1
391 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
392 ; GFX9-NEXT: s_add_u32 s8, s8, s3
393 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
394 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
395 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
396 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
397 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
398 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
399 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
401 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
402 ; GFX9-NEXT: s_waitcnt vmcnt(2)
403 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
404 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
405 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
406 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
407 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
408 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
409 ; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
410 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
411 ; GFX9-NEXT: s_waitcnt vmcnt(1)
412 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
413 ; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
414 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
415 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
416 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
417 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
418 ; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
419 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
421 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
422 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
423 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
424 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
425 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
426 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
427 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
428 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
429 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
430 ; GFX9-NEXT: s_endpgm
432 ; GFX9-DL-LABEL: udot8_acc16:
433 ; GFX9-DL: ; %bb.0: ; %entry
434 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
435 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
436 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
437 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
438 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
439 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
440 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
441 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
443 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
445 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
446 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
447 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
448 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
449 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
450 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
451 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
452 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
453 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
454 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
455 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
456 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
457 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
458 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
459 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
460 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
461 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
462 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
463 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
464 ; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
465 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
466 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
467 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
468 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
469 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
470 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
471 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
472 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
473 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
474 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
475 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
476 ; GFX9-DL-NEXT: s_endpgm
478 ; GFX10-DL-LABEL: udot8_acc16:
479 ; GFX10-DL: ; %bb.0: ; %entry
480 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
481 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
482 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
483 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
484 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
485 ; GFX10-DL-NEXT: s_clause 0x1
486 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
487 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
488 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
489 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
490 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
491 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX10-DL-NEXT: s_clause 0x1
493 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
494 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
495 ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3]
496 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
497 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
498 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
499 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
500 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
501 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
502 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
503 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
504 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
505 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
506 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
507 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
508 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
509 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
510 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
511 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
512 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
513 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
514 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
515 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
516 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
517 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
518 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
519 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
520 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
521 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
522 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
523 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3]
524 ; GFX10-DL-NEXT: s_endpgm
525 <8 x i4> addrspace(1)* %src2,
526 i16 addrspace(1)* nocapture %dst) {
; Per-work-item addressing: index both sources by workitem.id.x and load one
; packed <8 x i4> vector from each.
528 %idx = call i32 @llvm.amdgcn.workitem.id.x()
529 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
530 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
531 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
532 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Lanes 0-7: extract the matching i4 elements, zero-extend them to i16 and
; multiply. Both operands are at most 15 after the zext, so each product is at
; most 225 and cannot wrap, consistent with the nuw nsw flags.
534 %v1e0 = extractelement <8 x i4> %vec1, i64 0
535 %cv1e0 = zext i4 %v1e0 to i16
536 %v2e0 = extractelement <8 x i4> %vec2, i64 0
537 %cv2e0 = zext i4 %v2e0 to i16
538 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
540 %v1e1 = extractelement <8 x i4> %vec1, i64 1
541 %cv1e1 = zext i4 %v1e1 to i16
542 %v2e1 = extractelement <8 x i4> %vec2, i64 1
543 %cv2e1 = zext i4 %v2e1 to i16
544 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
546 %v1e2 = extractelement <8 x i4> %vec1, i64 2
547 %cv1e2 = zext i4 %v1e2 to i16
548 %v2e2 = extractelement <8 x i4> %vec2, i64 2
549 %cv2e2 = zext i4 %v2e2 to i16
550 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
552 %v1e3 = extractelement <8 x i4> %vec1, i64 3
553 %cv1e3 = zext i4 %v1e3 to i16
554 %v2e3 = extractelement <8 x i4> %vec2, i64 3
555 %cv2e3 = zext i4 %v2e3 to i16
556 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
558 %v1e4 = extractelement <8 x i4> %vec1, i64 4
559 %cv1e4 = zext i4 %v1e4 to i16
560 %v2e4 = extractelement <8 x i4> %vec2, i64 4
561 %cv2e4 = zext i4 %v2e4 to i16
562 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
564 %v1e5 = extractelement <8 x i4> %vec1, i64 5
565 %cv1e5 = zext i4 %v1e5 to i16
566 %v2e5 = extractelement <8 x i4> %vec2, i64 5
567 %cv2e5 = zext i4 %v2e5 to i16
568 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
570 %v1e6 = extractelement <8 x i4> %vec1, i64 6
571 %cv1e6 = zext i4 %v1e6 to i16
572 %v2e6 = extractelement <8 x i4> %vec2, i64 6
573 %cv2e6 = zext i4 %v2e6 to i16
574 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
576 %v1e7 = extractelement <8 x i4> %vec1, i64 7
577 %cv1e7 = zext i4 %v1e7 to i16
578 %v2e7 = extractelement <8 x i4> %vec2, i64 7
579 %cv2e7 = zext i4 %v2e7 to i16
580 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
; Load the running i16 accumulator from %dst, then add the eight products to
; it one at a time in lane order.
582 %acc = load i16, i16 addrspace(1)* %dst, align 4
583 %add1 = add i16 %mul0, %acc
584 %add2 = add i16 %add1, %mul1
585 %add3 = add i16 %add2, %mul2
586 %add4 = add i16 %add3, %mul3
587 %add5 = add i16 %add4, %mul4
588 %add6 = add i16 %add5, %mul5
589 %add7 = add i16 %add6, %mul6
590 %add8 = add i16 %add7, %mul7
; Write the final i16 sum back over the accumulator in %dst.
592 store i16 %add8, i16 addrspace(1)* %dst, align 4
596 ; TODO: Remove the unnecessary instruction (the one that zero-extends the
597 ; 2nd MAD) so that the pattern recognizer can kick in.
598 define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
599 ; GFX7-LABEL: udot8_acc8:
600 ; GFX7: ; %bb.0: ; %entry
601 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
602 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
603 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
604 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
605 ; GFX7-NEXT: s_mov_b32 s14, -1
606 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
607 ; GFX7-NEXT: s_add_u32 s12, s12, s3
608 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
609 ; GFX7-NEXT: s_mov_b32 s10, 0
610 ; GFX7-NEXT: s_mov_b32 s11, s3
611 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
612 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
613 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
614 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
615 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
616 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
617 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
618 ; GFX7-NEXT: s_mov_b32 s2, -1
619 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
620 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
621 ; GFX7-NEXT: s_waitcnt vmcnt(2)
622 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
623 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
624 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
625 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
626 ; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
627 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
628 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
629 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
630 ; GFX7-NEXT: s_waitcnt vmcnt(1)
631 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
632 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
633 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
634 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
635 ; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
636 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
637 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
638 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
639 ; GFX7-NEXT: s_waitcnt vmcnt(0)
640 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
641 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
642 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
643 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
644 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
645 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
646 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
647 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
648 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
649 ; GFX7-NEXT: s_endpgm
651 ; GFX8-LABEL: udot8_acc8:
652 ; GFX8: ; %bb.0: ; %entry
653 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
654 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
655 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
656 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
657 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
658 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
659 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
660 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
661 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
662 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
663 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
664 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
665 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
666 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
667 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
668 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
669 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
670 ; GFX8-NEXT: s_mov_b32 s10, -1
671 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
672 ; GFX8-NEXT: s_add_u32 s8, s8, s3
673 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
674 ; GFX8-NEXT: s_waitcnt vmcnt(2)
675 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
676 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
677 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
678 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
679 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
680 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
681 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
682 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
683 ; GFX8-NEXT: s_waitcnt vmcnt(1)
684 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
685 ; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
686 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
687 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
688 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
689 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
690 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
691 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
692 ; GFX8-NEXT: s_waitcnt vmcnt(0)
693 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
694 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
695 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
696 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
697 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
698 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
699 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
700 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
701 ; GFX8-NEXT: flat_store_byte v[0:1], v2
702 ; GFX8-NEXT: s_endpgm
704 ; GFX9-LABEL: udot8_acc8:
705 ; GFX9: ; %bb.0: ; %entry
706 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
707 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
708 ; GFX9-NEXT: s_mov_b32 s10, -1
709 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
710 ; GFX9-NEXT: s_add_u32 s8, s8, s3
711 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
712 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
713 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
714 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
715 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
716 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
717 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
718 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
719 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
720 ; GFX9-NEXT: s_waitcnt vmcnt(2)
721 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
722 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
723 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
724 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
725 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
726 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
727 ; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
728 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
729 ; GFX9-NEXT: s_waitcnt vmcnt(1)
730 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
731 ; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
732 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
733 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
734 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
735 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
736 ; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
737 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
740 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
741 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
742 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
743 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
744 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
745 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
746 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
747 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
748 ; GFX9-NEXT: s_endpgm
750 ; GFX9-DL-LABEL: udot8_acc8:
751 ; GFX9-DL: ; %bb.0: ; %entry
752 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
753 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
754 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
755 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
756 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
757 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
758 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
759 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
760 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
761 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
762 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
763 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
764 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
765 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
766 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
767 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
768 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
769 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
770 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
771 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
772 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
773 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
774 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
775 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
776 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
777 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
778 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
779 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
780 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
781 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
782 ; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
783 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
784 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
785 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
786 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
787 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
788 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
789 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
790 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
791 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
792 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
793 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
794 ; GFX9-DL-NEXT: s_endpgm
796 ; GFX10-DL-LABEL: udot8_acc8:
797 ; GFX10-DL: ; %bb.0: ; %entry
798 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
799 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
800 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
801 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
802 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
803 ; GFX10-DL-NEXT: s_clause 0x1
804 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
805 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
806 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
807 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
808 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
809 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
810 ; GFX10-DL-NEXT: s_clause 0x1
811 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
812 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
813 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
814 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
815 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
816 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
817 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
818 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
819 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
820 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
821 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
822 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
823 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
824 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
825 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
826 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
827 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
828 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
829 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
830 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
831 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
832 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
833 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
834 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
835 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
836 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
837 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
838 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
839 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
840 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
841 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
842 ; GFX10-DL-NEXT: s_endpgm
843 <8 x i4> addrspace(1)* %src2,
844 i8 addrspace(1)* nocapture %dst) {
846 %idx = call i32 @llvm.amdgcn.workitem.id.x()
847 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
848 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
849 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
850 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
852 %v1e0 = extractelement <8 x i4> %vec1, i64 0
853 %cv1e0 = zext i4 %v1e0 to i8
854 %v2e0 = extractelement <8 x i4> %vec2, i64 0
855 %cv2e0 = zext i4 %v2e0 to i8
856 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
858 %v1e1 = extractelement <8 x i4> %vec1, i64 1
859 %cv1e1 = zext i4 %v1e1 to i8
860 %v2e1 = extractelement <8 x i4> %vec2, i64 1
861 %cv2e1 = zext i4 %v2e1 to i8
862 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
864 %v1e2 = extractelement <8 x i4> %vec1, i64 2
865 %cv1e2 = zext i4 %v1e2 to i8
866 %v2e2 = extractelement <8 x i4> %vec2, i64 2
867 %cv2e2 = zext i4 %v2e2 to i8
868 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
870 %v1e3 = extractelement <8 x i4> %vec1, i64 3
871 %cv1e3 = zext i4 %v1e3 to i8
872 %v2e3 = extractelement <8 x i4> %vec2, i64 3
873 %cv2e3 = zext i4 %v2e3 to i8
874 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
876 %v1e4 = extractelement <8 x i4> %vec1, i64 4
877 %cv1e4 = zext i4 %v1e4 to i8
878 %v2e4 = extractelement <8 x i4> %vec2, i64 4
879 %cv2e4 = zext i4 %v2e4 to i8
880 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
882 %v1e5 = extractelement <8 x i4> %vec1, i64 5
883 %cv1e5 = zext i4 %v1e5 to i8
884 %v2e5 = extractelement <8 x i4> %vec2, i64 5
885 %cv2e5 = zext i4 %v2e5 to i8
886 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
888 %v1e6 = extractelement <8 x i4> %vec1, i64 6
889 %cv1e6 = zext i4 %v1e6 to i8
890 %v2e6 = extractelement <8 x i4> %vec2, i64 6
891 %cv2e6 = zext i4 %v2e6 to i8
892 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
894 %v1e7 = extractelement <8 x i4> %vec1, i64 7
895 %cv1e7 = zext i4 %v1e7 to i8
896 %v2e7 = extractelement <8 x i4> %vec2, i64 7
897 %cv2e7 = zext i4 %v2e7 to i8
898 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
900 %acc = load i8, i8 addrspace(1)* %dst, align 4
901 %add1 = add i8 %mul0, %acc
902 %add2 = add i8 %add1, %mul1
903 %add3 = add i8 %add2, %mul2
904 %add4 = add i8 %add3, %mul3
905 %add5 = add i8 %add4, %mul4
906 %add6 = add i8 %add5, %mul5
907 %add7 = add i8 %add6, %mul6
908 %add8 = add i8 %add7, %mul7
910 store i8 %add8, i8 addrspace(1)* %dst, align 4
914 ; TODO: Remove the two unnecessary instructions (and+add after the 2nd MAD)
915 ; so that the pattern-recognizer can kick in.
; udot8_acc4 multiplies the eight i4 lanes of the vectors loaded from %src1 and
; %src2 pairwise, entirely in i4 (no zext/sext), adds the products to the i4
; accumulator loaded from %dst, and stores the i4 sum back to %dst.
; For every check prefix below, the expected code is eight MAD instructions
; followed by a v_and_b32 with 15 before the byte store.
916 define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
917 ; GFX7-LABEL: udot8_acc4:
918 ; GFX7: ; %bb.0: ; %entry
919 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
920 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
921 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
922 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
923 ; GFX7-NEXT: s_mov_b32 s14, -1
924 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
925 ; GFX7-NEXT: s_add_u32 s12, s12, s3
926 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
927 ; GFX7-NEXT: s_mov_b32 s10, 0
928 ; GFX7-NEXT: s_mov_b32 s11, s3
929 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
931 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
932 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
933 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
934 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
935 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
936 ; GFX7-NEXT: s_mov_b32 s2, -1
937 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
938 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
939 ; GFX7-NEXT: s_waitcnt vmcnt(2)
940 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
941 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
942 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
943 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
944 ; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
945 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
946 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
947 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
948 ; GFX7-NEXT: s_waitcnt vmcnt(1)
949 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
950 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
951 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
952 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
953 ; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
954 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
955 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
956 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
957 ; GFX7-NEXT: s_waitcnt vmcnt(0)
958 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
959 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
960 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
961 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
962 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
963 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
964 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
965 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
966 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
967 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
968 ; GFX7-NEXT: s_endpgm
970 ; GFX8-LABEL: udot8_acc4:
971 ; GFX8: ; %bb.0: ; %entry
972 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
973 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
974 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
975 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
976 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
977 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
978 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
979 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
980 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
981 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
982 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
983 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
984 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
985 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
986 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
987 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
988 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
989 ; GFX8-NEXT: s_mov_b32 s10, -1
990 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
991 ; GFX8-NEXT: s_add_u32 s8, s8, s3
992 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
993 ; GFX8-NEXT: s_waitcnt vmcnt(2)
994 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
995 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
996 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
997 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
998 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
999 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
1000 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
1001 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
1002 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1003 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
1004 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
1005 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
1006 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
1007 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
1008 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
1009 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
1010 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
1011 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1012 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1013 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
1014 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
1015 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
1016 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
1017 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
1018 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
1019 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
1020 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
1021 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1022 ; GFX8-NEXT: s_endpgm
1024 ; GFX9-LABEL: udot8_acc4:
1025 ; GFX9: ; %bb.0: ; %entry
1026 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1027 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1028 ; GFX9-NEXT: s_mov_b32 s10, -1
1029 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1030 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1031 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1032 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1033 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1034 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1035 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1036 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1037 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1038 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1039 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
1040 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1041 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1042 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1043 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
1044 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
1045 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
1046 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
1047 ; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
1048 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1049 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1050 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1051 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1052 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
1053 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
1054 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
1055 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
1056 ; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
1057 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1058 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1059 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1060 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
1061 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
1062 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
1063 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
1064 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
1065 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
1066 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
1067 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1068 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
1069 ; GFX9-NEXT: s_endpgm
1071 ; GFX9-DL-LABEL: udot8_acc4:
1072 ; GFX9-DL: ; %bb.0: ; %entry
1073 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1074 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1075 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1076 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1077 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1078 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1079 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1080 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1081 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1082 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1083 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1084 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1085 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1086 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
1087 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1088 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1089 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1090 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
1091 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
1092 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
1093 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
1094 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
1095 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1096 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1097 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1098 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1099 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
1100 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
1101 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
1102 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
1103 ; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
1104 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1105 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1106 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1107 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
1108 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
1109 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
1110 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
1111 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
1112 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
1113 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
1114 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1115 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
1116 ; GFX9-DL-NEXT: s_endpgm
1118 ; GFX10-DL-LABEL: udot8_acc4:
1119 ; GFX10-DL: ; %bb.0: ; %entry
1120 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1121 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1122 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1123 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1124 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1125 ; GFX10-DL-NEXT: s_clause 0x1
1126 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1127 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1128 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1129 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1130 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1131 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1132 ; GFX10-DL-NEXT: s_clause 0x1
1133 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1134 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1135 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1136 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1137 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
1138 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1139 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
1140 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
1141 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
1142 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1143 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
1144 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
1145 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
1146 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1147 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
1148 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
1149 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1150 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
1151 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
1152 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1153 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
1154 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
1155 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1156 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1157 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
1158 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
1159 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
1160 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1161 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1162 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
1163 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
1164 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1165 ; GFX10-DL-NEXT: s_endpgm
1166 <8 x i4> addrspace(1)* %src2,
1167 i4 addrspace(1)* nocapture %dst) {
; Load one <8 x i4> from each source, both indexed by the workitem x id.
1169 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1170 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1171 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1172 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1173 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Pairwise products of lanes 0..7, computed directly in i4 with nuw nsw.
1175 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1176 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1177 %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1179 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1180 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1181 %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1183 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1184 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1185 %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1187 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1188 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1189 %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1191 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1192 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1193 %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1195 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1196 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1197 %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1199 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1200 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1201 %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1203 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1204 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1205 %mul7 = mul nuw nsw i4 %v1e7, %v2e7
; Add the products in lane order to the i4 accumulator loaded from %dst; the
; running sum is the first add operand from %add2 on.
1207 %acc = load i4, i4 addrspace(1)* %dst, align 4
1208 %add1 = add i4 %mul0, %acc
1209 %add2 = add i4 %add1, %mul1
1210 %add3 = add i4 %add2, %mul2
1211 %add4 = add i4 %add3, %mul3
1212 %add5 = add i4 %add4, %mul4
1213 %add6 = add i4 %add5, %mul5
1214 %add7 = add i4 %add6, %mul6
1215 %add8 = add i4 %add7, %mul7
1217 store i4 %add8, i4 addrspace(1)* %dst, align 4
1221 ; TODO: Currently, permutation of udot8 is turned off due to a huge increase
1222 ; in the compile time.
; udot8_CommutationInsideMAD multiplies the eight i4 lanes of the vectors loaded
; from %src1 and %src2 pairwise in i4 and accumulates them into the i4 loaded
; from %dst, writing each add as (product, running sum), i.e. with the product
; as the first operand.
; For every check prefix below, the expected code is eight MAD instructions
; followed by a v_and_b32 with 15 before the byte store.
1223 define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
1224 ; GFX7-LABEL: udot8_CommutationInsideMAD:
1225 ; GFX7: ; %bb.0: ; %entry
1226 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1227 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1228 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1229 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1230 ; GFX7-NEXT: s_mov_b32 s14, -1
1231 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1232 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1233 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1234 ; GFX7-NEXT: s_mov_b32 s10, 0
1235 ; GFX7-NEXT: s_mov_b32 s11, s3
1236 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1238 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1239 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1240 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1241 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1242 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1243 ; GFX7-NEXT: s_mov_b32 s2, -1
1244 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
1245 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1246 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1247 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
1248 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
1249 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
1250 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
1251 ; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
1252 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
1253 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
1254 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1255 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1256 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
1257 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
1258 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
1259 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
1260 ; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
1261 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
1262 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
1263 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1264 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1265 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1266 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
1267 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1268 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1269 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1270 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1271 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1272 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
1273 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1274 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
1275 ; GFX7-NEXT: s_endpgm
1277 ; GFX8-LABEL: udot8_CommutationInsideMAD:
1278 ; GFX8: ; %bb.0: ; %entry
1279 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1280 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1281 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1282 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1283 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1284 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1285 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1286 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1287 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1289 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1290 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1291 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1292 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
1293 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1294 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1295 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
1296 ; GFX8-NEXT: s_mov_b32 s10, -1
1297 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1298 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1299 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1300 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1301 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
1302 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
1303 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
1304 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
1305 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
1306 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
1307 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
1308 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
1309 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1310 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
1311 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
1312 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
1313 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
1314 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
1315 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
1316 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
1317 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
1318 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1319 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1320 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
1321 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
1322 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
1323 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
1324 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
1325 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
1326 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
1327 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
1328 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1329 ; GFX8-NEXT: s_endpgm
1331 ; GFX9-LABEL: udot8_CommutationInsideMAD:
1332 ; GFX9: ; %bb.0: ; %entry
1333 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1334 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1335 ; GFX9-NEXT: s_mov_b32 s10, -1
1336 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1337 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1338 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1339 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1340 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1341 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1342 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1343 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1344 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1345 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1346 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
1347 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1348 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1349 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1350 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
1351 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
1352 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
1353 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
1354 ; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
1355 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1356 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1357 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1358 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1359 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
1360 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
1361 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
1362 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
1363 ; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
1364 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1365 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1366 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1367 ; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
1368 ; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
1369 ; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
1370 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
1371 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
1372 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
1373 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
1374 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1375 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
1376 ; GFX9-NEXT: s_endpgm
1378 ; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1379 ; GFX9-DL: ; %bb.0: ; %entry
1380 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1381 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1382 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1383 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1384 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1385 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1386 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1387 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1388 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1389 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1390 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1391 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1392 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1393 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
1394 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1395 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1396 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1397 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
1398 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
1399 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
1400 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
1401 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
1402 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1403 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1404 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1405 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1406 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
1407 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
1408 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
1409 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
1410 ; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
1411 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1412 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1413 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1414 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
1415 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
1416 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
1417 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
1418 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
1419 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
1420 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
1421 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1422 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
1423 ; GFX9-DL-NEXT: s_endpgm
1425 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1426 ; GFX10-DL: ; %bb.0: ; %entry
1427 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1428 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1429 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1430 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1431 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1432 ; GFX10-DL-NEXT: s_clause 0x1
1433 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1434 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1435 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1436 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1437 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1438 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1439 ; GFX10-DL-NEXT: s_clause 0x1
1440 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1441 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1442 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1443 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1444 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
1445 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1446 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
1447 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
1448 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
1449 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1450 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
1451 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
1452 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
1453 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1454 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
1455 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
1456 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1457 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
1458 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
1459 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1460 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
1461 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
1462 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1463 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1464 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
1465 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
1466 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
1467 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1468 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1469 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
1470 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
1471 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1472 ; GFX10-DL-NEXT: s_endpgm
1473 <8 x i4> addrspace(1)* %src2,
1474 i4 addrspace(1)* nocapture %dst) {
; Load one <8 x i4> from each source, both indexed by the workitem x id.
1476 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1477 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1478 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1479 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1480 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Pairwise products of lanes 0..7, computed directly in i4 with nuw nsw.
1482 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1483 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1484 %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1486 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1487 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1488 %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1490 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1491 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1492 %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1494 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1495 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1496 %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1498 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1499 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1500 %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1502 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1503 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1504 %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1506 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1507 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1508 %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1510 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1511 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1512 %mul7 = mul nuw nsw i4 %v1e7, %v2e7
; Accumulate into the i4 loaded from %dst. From %add2 on, the product is the
; first add operand and the running sum the second (commuted operand order).
1514 %acc = load i4, i4 addrspace(1)* %dst, align 4
1515 %add1 = add i4 %mul0, %acc
1516 %add2 = add i4 %mul1, %add1
1517 %add3 = add i4 %mul2, %add2
1518 %add4 = add i4 %mul3, %add3
1519 %add5 = add i4 %mul4, %add4
1520 %add6 = add i4 %mul5, %add5
1521 %add7 = add i4 %mul6, %add6
1522 %add8 = add i4 %mul7, %add7
1524 store i4 %add8, i4 addrspace(1)* %dst, align 4
; Unsigned 8 x i4 dot product with an i32 accumulator in which one of the
; products has more than one use. Note that, despite the function name, the
; value with multiple uses in the IR below is %mul0 (used by both %add1 and
; %add); %mul1 is used only once.
; As currently checked, none of the targets (including GFX9-DL and GFX10-DL)
; emits v_dot8_u32_u4 for this input; every prefix expects the expanded
; per-element multiply/mad/add sequence.
1528 define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1529 ; GFX7-LABEL: udot8_multiuses_mul1:
1530 ; GFX7: ; %bb.0: ; %entry
1531 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1532 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1533 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1534 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1535 ; GFX7-NEXT: s_mov_b32 s14, -1
1536 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1537 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1538 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1539 ; GFX7-NEXT: s_mov_b32 s10, 0
1540 ; GFX7-NEXT: s_mov_b32 s11, s3
1541 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1542 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1543 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1544 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1545 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1546 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1547 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1548 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1549 ; GFX7-NEXT: s_mov_b32 s2, -1
1550 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1551 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1552 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
1553 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
1554 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
1555 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
1556 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
1557 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
1558 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
1559 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1560 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1561 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1562 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
1563 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
1564 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
1565 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
1566 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
1567 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
1568 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1569 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1570 ; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4
1571 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
1572 ; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16
1573 ; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2
1574 ; GFX7-NEXT: v_mad_u32_u24 v2, v6, v13, v2
1575 ; GFX7-NEXT: v_mad_u32_u24 v2, v5, v12, v2
1576 ; GFX7-NEXT: v_mad_u32_u24 v2, v4, v11, v2
1577 ; GFX7-NEXT: v_mad_u32_u24 v2, v3, v10, v2
1578 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v2
1579 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1580 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1581 ; GFX7-NEXT: s_endpgm
1583 ; GFX8-LABEL: udot8_multiuses_mul1:
1584 ; GFX8: ; %bb.0: ; %entry
1585 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1586 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1587 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1588 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1589 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1590 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1591 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1592 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1593 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1595 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1596 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1597 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1598 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1599 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1600 ; GFX8-NEXT: s_mov_b32 s10, -1
1601 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1602 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1603 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1604 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1605 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
1606 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
1607 ; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
1608 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
1609 ; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
1610 ; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
1611 ; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
1612 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
1613 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1614 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1615 ; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
1616 ; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
1617 ; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
1618 ; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4
1619 ; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4
1620 ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
1621 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1622 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1623 ; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2
1624 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16
1625 ; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16
1626 ; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3
1627 ; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3
1628 ; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3
1629 ; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3
1630 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3
1631 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2
1632 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
1633 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1634 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1635 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1636 ; GFX8-NEXT: s_endpgm
1638 ; GFX9-LABEL: udot8_multiuses_mul1:
1639 ; GFX9: ; %bb.0: ; %entry
1640 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1641 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1642 ; GFX9-NEXT: s_mov_b32 s10, -1
1643 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1644 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1645 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1646 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1647 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1648 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1650 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1651 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1652 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1653 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1654 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1655 ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4
1656 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1657 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
1658 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
1659 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
1660 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
1661 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
1662 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1663 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1664 ; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4
1665 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1666 ; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
1667 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
1668 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
1669 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
1670 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
1671 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1672 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2
1673 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1674 ; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0
1675 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16
1676 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
1677 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1
1678 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
1679 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
1680 ; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8
1681 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
1682 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
1683 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6
1684 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4
1685 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2
1686 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1687 ; GFX9-NEXT: s_endpgm
1689 ; GFX9-DL-LABEL: udot8_multiuses_mul1:
1690 ; GFX9-DL: ; %bb.0: ; %entry
1691 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1692 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1693 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1694 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1695 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1696 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1697 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1698 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1699 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1700 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1701 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1702 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1703 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1704 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1705 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1706 ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4
1707 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1708 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
1709 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
1710 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
1711 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
1712 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
1713 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1714 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1715 ; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
1716 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1717 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
1718 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
1719 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
1720 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
1721 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
1722 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1723 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2
1724 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1725 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
1726 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16
1727 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15
1728 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1
1729 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14
1730 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13
1731 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8
1732 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12
1733 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11
1734 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6
1735 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4
1736 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2
1737 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1738 ; GFX9-DL-NEXT: s_endpgm
1740 ; GFX10-DL-LABEL: udot8_multiuses_mul1:
1741 ; GFX10-DL: ; %bb.0: ; %entry
1742 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1743 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1744 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1745 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1746 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1747 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1748 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1749 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1750 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1751 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1752 ; GFX10-DL-NEXT: s_clause 0x1
1753 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1754 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1755 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1756 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1757 ; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
1758 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1759 ; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2
1760 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
1761 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1
1762 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
1763 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
1764 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4
1765 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
1766 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4
1767 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
1768 ; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
1769 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
1770 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1771 ; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2
1772 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
1773 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
1774 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
1775 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12
1776 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13
1777 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1778 ; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
1779 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15
1780 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14
1781 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7
1782 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2
1783 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10
1784 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9
1785 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5
1786 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
1787 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1788 ; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
1789 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
1790 ; GFX10-DL-NEXT: s_endpgm
1791 <8 x i4> addrspace(1)* %src2,
1792 i32 addrspace(1)* nocapture %dst) {
; Each work-item loads one <8 x i4> vector from %src1 and one from %src2,
; both indexed by its workitem id.
1794 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1795 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1796 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1797 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1798 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; For every lane i (0..7): extract the i4 element from each vector,
; zero-extend both to i32 and multiply them into %mul<i>.
1800 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1801 %cv1e0 = zext i4 %v1e0 to i32
1802 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1803 %cv2e0 = zext i4 %v2e0 to i32
1804 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1806 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1807 %cv1e1 = zext i4 %v1e1 to i32
1808 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1809 %cv2e1 = zext i4 %v2e1 to i32
1810 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1812 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1813 %cv1e2 = zext i4 %v1e2 to i32
1814 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1815 %cv2e2 = zext i4 %v2e2 to i32
1816 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1818 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1819 %cv1e3 = zext i4 %v1e3 to i32
1820 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1821 %cv2e3 = zext i4 %v2e3 to i32
1822 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1824 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1825 %cv1e4 = zext i4 %v1e4 to i32
1826 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1827 %cv2e4 = zext i4 %v2e4 to i32
1828 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1830 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1831 %cv1e5 = zext i4 %v1e5 to i32
1832 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1833 %cv2e5 = zext i4 %v2e5 to i32
1834 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1836 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1837 %cv1e6 = zext i4 %v1e6 to i32
1838 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1839 %cv2e6 = zext i4 %v2e6 to i32
1840 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1842 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1843 %cv1e7 = zext i4 %v1e7 to i32
1844 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1845 %cv2e7 = zext i4 %v2e7 to i32
1846 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; The accumulation starts from the i32 currently stored at %dst.
1848 %acc = load i32, i32 addrspace(1)* %dst, align 4
1849 %add1 = add i32 %mul0, %acc
; Second use of %mul0, outside the main add chain; %add is consumed only by
; %res below.
1850 %add = add i32 %mul0, %add1
1851 %add2 = add i32 %add1, %mul1
1852 %add3 = add i32 %add2, %mul2
1853 %add4 = add i32 %add3, %mul3
1854 %add5 = add i32 %add4, %mul4
1855 %add6 = add i32 %add5, %mul5
1856 %add7 = add i32 %add6, %mul6
1857 %add8 = add i32 %add7, %mul7
; %res = %add + %add8, so the stored value counts %mul0 three times and %acc
; twice, plus %mul1..%mul7 once each.
1859 %res = add i32 %add, %add8
1860 store i32 %res, i32 addrspace(1)* %dst, align 4
; Unsigned 8 x i4 dot product with an i32 accumulator where the multiply is
; written as a single vector operation: both inputs are zero-extended to
; <8 x i32>, multiplied once, and the eight product lanes are then extracted
; and added one by one on top of the value loaded from %dst.
; As currently checked, GFX9-DL and GFX10-DL fold this into a single
; v_dot8_u32_u4, while GFX7, GFX8 and GFX9 expect the expanded
; multiply/mad/add sequence.
1864 define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1865 ; GFX7-LABEL: udot8_acc32_vecMul:
1866 ; GFX7: ; %bb.0: ; %entry
1867 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1868 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1869 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1870 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1871 ; GFX7-NEXT: s_mov_b32 s14, -1
1872 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1873 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1874 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1875 ; GFX7-NEXT: s_mov_b32 s10, 0
1876 ; GFX7-NEXT: s_mov_b32 s11, s3
1877 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1878 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1879 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1880 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1881 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1882 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1883 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1884 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1885 ; GFX7-NEXT: s_mov_b32 s2, -1
1886 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1887 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1888 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
1889 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
1890 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
1891 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
1892 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
1893 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
1894 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
1895 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1896 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1897 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1898 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
1899 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
1900 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
1901 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
1902 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
1903 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
1904 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1905 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1906 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
1907 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1908 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1909 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1910 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1911 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1912 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
1913 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
1914 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1915 ; GFX7-NEXT: s_endpgm
1917 ; GFX8-LABEL: udot8_acc32_vecMul:
1918 ; GFX8: ; %bb.0: ; %entry
1919 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1920 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1921 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1922 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1923 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1924 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1925 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1926 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1927 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1929 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1930 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1931 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1932 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1933 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1934 ; GFX8-NEXT: s_mov_b32 s10, -1
1935 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1936 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1937 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1938 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1939 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
1940 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
1941 ; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
1942 ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
1943 ; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
1944 ; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
1945 ; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
1946 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
1947 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1948 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1949 ; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
1950 ; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
1951 ; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
1952 ; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4
1953 ; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4
1954 ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
1955 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1956 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1957 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
1958 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1959 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1960 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1961 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1962 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1963 ; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
1964 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
1965 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1966 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1967 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1968 ; GFX8-NEXT: s_endpgm
1970 ; GFX9-LABEL: udot8_acc32_vecMul:
1971 ; GFX9: ; %bb.0: ; %entry
1972 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1973 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1974 ; GFX9-NEXT: s_mov_b32 s10, -1
1975 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1976 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1977 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1978 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1979 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1980 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1981 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1983 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1984 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1985 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1986 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1987 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
1988 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
1989 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
1990 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
1991 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
1992 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
1993 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
1994 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1995 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1996 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1997 ; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
1998 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
1999 ; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
2000 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
2001 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
2002 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
2003 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
2004 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
2005 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
2006 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
2007 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
2008 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2009 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
2010 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
2011 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
2012 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
2013 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
2014 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
2015 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
2016 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
2017 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
2018 ; GFX9-NEXT: s_endpgm
2020 ; GFX9-DL-LABEL: udot8_acc32_vecMul:
2021 ; GFX9-DL: ; %bb.0: ; %entry
2022 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2023 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2024 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2025 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2026 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2027 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2028 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2029 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2030 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2031 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2032 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2033 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2034 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
2035 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2036 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2037 ; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
2038 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
2039 ; GFX9-DL-NEXT: s_endpgm
2041 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
2042 ; GFX10-DL: ; %bb.0: ; %entry
2043 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2044 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2045 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2046 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2047 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2048 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2049 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2050 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2051 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2052 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2053 ; GFX10-DL-NEXT: s_clause 0x1
2054 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2055 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2056 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2057 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
2058 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2059 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
2060 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
2061 ; GFX10-DL-NEXT: s_endpgm
2062 <8 x i4> addrspace(1)* %src2,
2063 i32 addrspace(1)* nocapture %dst) {
; Each work-item loads one <8 x i4> vector from %src1 and one from %src2,
; both indexed by its workitem id.
2065 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2066 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2067 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2068 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2069 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Widen whole vectors (unsigned) before multiplying, rather than widening
; each extracted element separately.
2071 %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2072 %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
; One vector multiply produces all eight products; the lanes are split out
; afterwards so they can be summed with scalar adds.
2074 %mul = mul <8 x i32> %cvec1, %cvec2
2075 %mul0 = extractelement <8 x i32> %mul, i64 0
2076 %mul1 = extractelement <8 x i32> %mul, i64 1
2077 %mul2 = extractelement <8 x i32> %mul, i64 2
2078 %mul3 = extractelement <8 x i32> %mul, i64 3
2079 %mul4 = extractelement <8 x i32> %mul, i64 4
2080 %mul5 = extractelement <8 x i32> %mul, i64 5
2081 %mul6 = extractelement <8 x i32> %mul, i64 6
2082 %mul7 = extractelement <8 x i32> %mul, i64 7
; Linear add chain: start from the i32 currently stored at %dst and add the
; products in lane order; every product is used exactly once.
2084 %acc = load i32, i32 addrspace(1)* %dst, align 4
2085 %add1 = add i32 %mul0, %acc
2086 %add2 = add i32 %add1, %mul1
2087 %add3 = add i32 %add2, %mul2
2088 %add4 = add i32 %add3, %mul3
2089 %add5 = add i32 %add4, %mul4
2090 %add6 = add i32 %add5, %mul5
2091 %add7 = add i32 %add6, %mul6
2092 %add8 = add i32 %add7, %mul7
2094 store i32 %add8, i32 addrspace(1)* %dst, align 4
2098 ; TODO: Clean up the code (by default, pk_mad_I16 should be generated), then
2099 ; support the pattern.
2100 define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2101 ; GFX7-LABEL: udot8_acc16_vecMul:
2102 ; GFX7: ; %bb.0: ; %entry
2103 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2104 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2105 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2106 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2107 ; GFX7-NEXT: s_mov_b32 s14, -1
2108 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2109 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2110 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2111 ; GFX7-NEXT: s_mov_b32 s10, 0
2112 ; GFX7-NEXT: s_mov_b32 s11, s3
2113 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2114 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2115 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2116 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2117 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2118 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2119 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2120 ; GFX7-NEXT: s_mov_b32 s2, -1
2121 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
2122 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2123 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2124 ; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4
2125 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2
2126 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
2127 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
2128 ; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
2129 ; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
2130 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
2131 ; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 16
2132 ; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9
2133 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2134 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v0
2135 ; GFX7-NEXT: v_and_b32_e32 v14, 15, v0
2136 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
2137 ; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9
2138 ; GFX7-NEXT: v_or_b32_e32 v8, v14, v8
2139 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v7
2140 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v7
2141 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v8
2142 ; GFX7-NEXT: v_and_b32_e32 v8, 15, v8
2143 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2144 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v8, v1
2145 ; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
2146 ; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4
2147 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
2148 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
2149 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
2150 ; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
2151 ; GFX7-NEXT: v_alignbit_b32 v0, v15, v0, 16
2152 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
2153 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v2
2154 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
2155 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0
2156 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2157 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
2158 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2159 ; GFX7-NEXT: v_mad_u32_u24 v0, v16, v15, v0
2160 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2161 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2162 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2163 ; GFX7-NEXT: s_endpgm
2165 ; GFX8-LABEL: udot8_acc16_vecMul:
2166 ; GFX8: ; %bb.0: ; %entry
2167 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2168 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2169 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2170 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2171 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2172 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2173 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2174 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2175 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2176 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2177 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2178 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2179 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2180 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2181 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2182 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2183 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
2184 ; GFX8-NEXT: s_mov_b32 s10, -1
2185 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2186 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2187 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2188 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2189 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2190 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
2191 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
2192 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
2193 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
2194 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
2195 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
2196 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
2197 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2198 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
2199 ; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
2200 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
2201 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
2202 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
2203 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
2204 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
2205 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
2206 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2207 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2208 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
2209 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
2210 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
2211 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
2212 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
2213 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
2214 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
2215 ; GFX8-NEXT: flat_store_short v[0:1], v2
2216 ; GFX8-NEXT: s_endpgm
2218 ; GFX9-LABEL: udot8_acc16_vecMul:
2219 ; GFX9: ; %bb.0: ; %entry
2220 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2221 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2222 ; GFX9-NEXT: s_mov_b32 s10, -1
2223 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2224 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2225 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2226 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2227 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2228 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2229 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2230 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2231 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2232 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2233 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
2234 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2235 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
2236 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
2237 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2238 ; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
2239 ; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4
2240 ; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
2241 ; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4
2242 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7
2243 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12
2244 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5
2245 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4
2246 ; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
2247 ; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2248 ; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12
2249 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2250 ; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4
2251 ; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
2252 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9
2253 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
2254 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
2255 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1
2256 ; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
2257 ; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4
2258 ; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4
2259 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2
2260 ; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
2261 ; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9
2262 ; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14
2263 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2264 ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
2265 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
2266 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
2267 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16
2268 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
2269 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2270 ; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
2271 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2272 ; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16
2273 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
2274 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2275 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
2276 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2277 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
2278 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2279 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
2280 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
2282 ; GFX9-NEXT: s_endpgm
2284 ; GFX9-DL-LABEL: udot8_acc16_vecMul:
2285 ; GFX9-DL: ; %bb.0: ; %entry
2286 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2287 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2288 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2289 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2290 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2291 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2292 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2293 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2294 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2295 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2296 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2297 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2298 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2299 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
2300 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2301 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
2302 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
2303 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2304 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
2305 ; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
2306 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
2307 ; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4
2308 ; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7
2309 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
2310 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
2311 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4
2312 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
2313 ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2314 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12
2315 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2316 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4
2317 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
2318 ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9
2319 ; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14
2320 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
2321 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1
2322 ; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
2323 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4
2324 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4
2325 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2
2326 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
2327 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
2328 ; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14
2329 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2330 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
2331 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
2332 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
2333 ; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16
2334 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
2335 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2336 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2
2337 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2338 ; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16
2339 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
2340 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2341 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
2342 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2343 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
2344 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2345 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
2346 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2347 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
2348 ; GFX9-DL-NEXT: s_endpgm
2350 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
2351 ; GFX10-DL: ; %bb.0: ; %entry
2352 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2353 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2354 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2355 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2356 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2357 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2358 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2359 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2360 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2361 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2362 ; GFX10-DL-NEXT: s_clause 0x1
2363 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2364 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2365 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2366 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
2367 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2368 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
2369 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2370 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
2371 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
2372 ; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
2373 ; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
2374 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6
2375 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
2376 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4
2377 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
2378 ; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8
2379 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
2380 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5
2381 ; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
2382 ; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
2383 ; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
2384 ; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
2385 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
2386 ; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4
2387 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
2388 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
2389 ; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11
2390 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
2391 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2392 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
2393 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4
2394 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
2395 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9
2396 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
2397 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
2398 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
2399 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
2400 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2401 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11
2402 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7
2403 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
2404 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
2405 ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8
2406 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
2407 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
2408 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
2409 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
2410 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1
2411 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2412 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
2413 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2414 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
2415 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2416 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
2417 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
2418 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
2419 ; GFX10-DL-NEXT: s_endpgm
2420 <8 x i4> addrspace(1)* %src2,
2421 i16 addrspace(1)* nocapture %dst) {
2423 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2424 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2425 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2426 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2427 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2429 %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2430 %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2432 %mul = mul <8 x i16> %cvec1, %cvec2
2433 %mul0 = extractelement <8 x i16> %mul, i64 0
2434 %mul1 = extractelement <8 x i16> %mul, i64 1
2435 %mul2 = extractelement <8 x i16> %mul, i64 2
2436 %mul3 = extractelement <8 x i16> %mul, i64 3
2437 %mul4 = extractelement <8 x i16> %mul, i64 4
2438 %mul5 = extractelement <8 x i16> %mul, i64 5
2439 %mul6 = extractelement <8 x i16> %mul, i64 6
2440 %mul7 = extractelement <8 x i16> %mul, i64 7
2442 %acc = load i16, i16 addrspace(1)* %dst, align 4
2443 %add1 = add i16 %mul0, %acc
2444 %add2 = add i16 %add1, %mul1
2445 %add3 = add i16 %add2, %mul2
2446 %add4 = add i16 %add3, %mul3
2447 %add5 = add i16 %add4, %mul4
2448 %add6 = add i16 %add5, %mul5
2449 %add7 = add i16 %add6, %mul6
2450 %add8 = add i16 %add7, %mul7
2452 store i16 %add8, i16 addrspace(1)* %dst, align 4
2456 ; TODO: Clean up the code to generate MAD; the pattern should be recognized then.
; Kernel under test: multiplies two <8 x i4> vectors lane-wise after
; zero-extending them to <8 x i8>, then adds all eight products to the i8
; value stored at %dst.
; The per-target assertion blocks that follow are autogenerated (see the NOTE
; on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2457 define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2458 ; GFX7-LABEL: udot8_acc8_vecMul:
2459 ; GFX7: ; %bb.0: ; %entry
2460 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2461 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2462 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2463 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2464 ; GFX7-NEXT: s_mov_b32 s14, -1
2465 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2466 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2467 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2468 ; GFX7-NEXT: s_mov_b32 s10, 0
2469 ; GFX7-NEXT: s_mov_b32 s11, s3
2470 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2471 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2472 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2473 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2474 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2475 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2476 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2477 ; GFX7-NEXT: s_mov_b32 s2, -1
2478 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
2479 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2480 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2481 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2
2482 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2
2483 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 4
2484 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2
2485 ; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4
2486 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2
2487 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2488 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0
2489 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0
2490 ; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v8
2491 ; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v4
2492 ; GFX7-NEXT: v_and_b32_e32 v5, 15, v2
2493 ; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4
2494 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v0
2495 ; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4
2496 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0
2497 ; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24
2498 ; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9
2499 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0
2500 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
2501 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
2502 ; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24
2503 ; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v11
2504 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
2505 ; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v15
2506 ; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9
2507 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2508 ; GFX7-NEXT: v_and_b32_e32 v0, 0xf0f, v0
2509 ; GFX7-NEXT: v_or_b32_e32 v8, v10, v8
2510 ; GFX7-NEXT: v_and_b32_e32 v2, 0xf0f, v2
2511 ; GFX7-NEXT: v_or_b32_e32 v4, v14, v4
2512 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6
2513 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
2514 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2515 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8
2516 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2517 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
2518 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
2519 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
2520 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v3
2521 ; GFX7-NEXT: v_and_b32_e32 v13, 15, v4
2522 ; GFX7-NEXT: v_bfe_u32 v8, v3, 8, 4
2523 ; GFX7-NEXT: v_bfe_u32 v14, v4, 8, 4
2524 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2525 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1
2526 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3
2527 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4
2528 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4
2529 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 4
2530 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
2531 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
2532 ; GFX7-NEXT: v_and_b32_e32 v9, 15, v2
2533 ; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
2534 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1
2535 ; GFX7-NEXT: v_bfe_u32 v10, v2, 8, 4
2536 ; GFX7-NEXT: v_bfe_u32 v16, v0, 8, 4
2537 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1
2538 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
2539 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4
2540 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0
2541 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4
2542 ; GFX7-NEXT: v_mad_u32_u24 v1, v10, v16, v1
2543 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2544 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0
2545 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2546 ; GFX7-NEXT: s_endpgm
2548 ; GFX8-LABEL: udot8_acc8_vecMul:
2549 ; GFX8: ; %bb.0: ; %entry
2550 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2551 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2552 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2553 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2554 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2555 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2556 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2557 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2558 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2559 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2560 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2561 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2562 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2563 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2564 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2565 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2566 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2567 ; GFX8-NEXT: s_mov_b32 s10, -1
2568 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2569 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2570 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2571 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2572 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
2573 ; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
2574 ; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4
2575 ; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4
2576 ; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
2577 ; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4
2578 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2579 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2
2580 ; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4
2581 ; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4
2582 ; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4
2583 ; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4
2584 ; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4
2585 ; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2586 ; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17
2587 ; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2588 ; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4
2589 ; GFX8-NEXT: v_and_b32_e32 v6, 15, v3
2590 ; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4
2591 ; GFX8-NEXT: v_and_b32_e32 v13, 15, v2
2592 ; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19
2593 ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15
2594 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2595 ; GFX8-NEXT: v_or_b32_e32 v9, v18, v9
2596 ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2597 ; GFX8-NEXT: v_or_b32_e32 v3, v2, v11
2598 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7
2599 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2600 ; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13
2601 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2602 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2603 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
2604 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
2605 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
2606 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
2607 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
2608 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2609 ; GFX8-NEXT: v_add_u16_e32 v3, v6, v4
2610 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
2611 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
2612 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
2613 ; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2
2614 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
2615 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9
2616 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
2617 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v9
2618 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2619 ; GFX8-NEXT: s_endpgm
2621 ; GFX9-LABEL: udot8_acc8_vecMul:
2622 ; GFX9: ; %bb.0: ; %entry
2623 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2624 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2625 ; GFX9-NEXT: s_mov_b32 s10, -1
2626 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2627 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2628 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2629 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2630 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2631 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2632 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2633 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2634 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2635 ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
2636 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2637 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2638 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1
2639 ; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4
2640 ; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4
2641 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2642 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
2643 ; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
2644 ; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4
2645 ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4
2646 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
2647 ; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
2648 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
2649 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
2650 ; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4
2651 ; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
2652 ; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
2653 ; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
2654 ; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4
2655 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2656 ; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16
2657 ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2658 ; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2
2659 ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14
2660 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2661 ; GFX9-NEXT: v_or_b32_e32 v8, v17, v8
2662 ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
2663 ; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2664 ; GFX9-NEXT: v_or_b32_e32 v1, v18, v10
2665 ; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
2666 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2667 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
2668 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2669 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v12
2670 ; GFX9-NEXT: v_or_b32_e32 v7, v12, v0
2671 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1
2672 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
2673 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2674 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2675 ; GFX9-NEXT: v_add_u16_e32 v1, v5, v4
2676 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v7
2677 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
2678 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
2679 ; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
2680 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v10
2681 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8
2682 ; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
2683 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
2684 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
2685 ; GFX9-NEXT: s_endpgm
2687 ; GFX9-DL-LABEL: udot8_acc8_vecMul:
2688 ; GFX9-DL: ; %bb.0: ; %entry
2689 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2690 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2691 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2692 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2693 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2694 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2695 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2696 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2697 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
2698 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2699 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2700 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2701 ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
2702 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2703 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2704 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
2705 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
2706 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
2707 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2708 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
2709 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
2710 ; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4
2711 ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
2712 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
2713 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
2714 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
2715 ; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
2716 ; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
2717 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
2718 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
2719 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
2720 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4
2721 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2722 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16
2723 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2724 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2
2725 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14
2726 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2727 ; GFX9-DL-NEXT: v_or_b32_e32 v8, v17, v8
2728 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
2729 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2730 ; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v10
2731 ; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
2732 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2733 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
2734 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2735 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12
2736 ; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v0
2737 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1
2738 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
2739 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2740 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2741 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v5, v4
2742 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
2743 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
2744 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
2745 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
2746 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10
2747 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v8
2748 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
2749 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
2750 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
2751 ; GFX9-DL-NEXT: s_endpgm
2753 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
2754 ; GFX10-DL: ; %bb.0: ; %entry
2755 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2756 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2757 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2758 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
2759 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2760 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2761 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2762 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2763 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2764 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2765 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2766 ; GFX10-DL-NEXT: s_clause 0x1
2767 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2768 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2769 ; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
2770 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2771 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
2772 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2773 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
2774 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
2775 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
2776 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
2777 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
2778 ; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v10
2779 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
2780 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1
2781 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
2782 ; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
2783 ; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
2784 ; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
2785 ; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v13
2786 ; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
2787 ; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2
2788 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 24, 4
2789 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
2790 ; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 16, 4
2791 ; GFX10-DL-NEXT: v_mul_lo_u16 v2, v8, v14
2792 ; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v1
2793 ; GFX10-DL-NEXT: v_or_b32_e32 v6, v7, v6
2794 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v11, v13
2795 ; GFX10-DL-NEXT: v_mul_lo_u16 v7, v9, v15
2796 ; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2
2797 ; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v0
2798 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
2799 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10
2800 ; GFX10-DL-NEXT: v_mul_lo_u16 v10, v12, v16
2801 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
2802 ; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v2
2803 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2804 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v8
2805 ; GFX10-DL-NEXT: v_or_b32_e32 v1, v10, v1
2806 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v7
2807 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
2808 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2809 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
2810 ; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2811 ; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2
2812 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
2813 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
2814 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6
2815 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
2816 ; GFX10-DL-NEXT: v_mad_u16 v0, v12, v16, v0
2817 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
2818 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
2819 ; GFX10-DL-NEXT: v_mad_u16 v0, v9, v15, v0
2820 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
2821 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
2822 ; GFX10-DL-NEXT: s_endpgm
2823 <8 x i4> addrspace(1)* %src2,
2824 i8 addrspace(1)* nocapture %dst) {
; Each work-item picks one packed <8 x i4> vector from %src1 and one from
; %src2, both indexed by its workitem.id.x value.
2826 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2827 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2828 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2829 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2830 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Widen every 4-bit lane to 8 bits with zero extension (unsigned operands).
2832 %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2833 %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
; Lane-wise 8-bit multiply, then pull out each of the eight products.
2835 %mul = mul <8 x i8> %cvec1, %cvec2
2836 %mul0 = extractelement <8 x i8> %mul, i64 0
2837 %mul1 = extractelement <8 x i8> %mul, i64 1
2838 %mul2 = extractelement <8 x i8> %mul, i64 2
2839 %mul3 = extractelement <8 x i8> %mul, i64 3
2840 %mul4 = extractelement <8 x i8> %mul, i64 4
2841 %mul5 = extractelement <8 x i8> %mul, i64 5
2842 %mul6 = extractelement <8 x i8> %mul, i64 6
2843 %mul7 = extractelement <8 x i8> %mul, i64 7
; Seed the sum with the value currently stored at %dst, then add the products
; in lane order 0..7. The adds are plain i8 adds without nsw/nuw flags.
2845 %acc = load i8, i8 addrspace(1)* %dst, align 4
2846 %add1 = add i8 %mul0, %acc
2847 %add2 = add i8 %add1, %mul1
2848 %add3 = add i8 %add2, %mul2
2849 %add4 = add i8 %add3, %mul3
2850 %add5 = add i8 %add4, %mul4
2851 %add6 = add i8 %add5, %mul5
2852 %add7 = add i8 %add6, %mul6
2853 %add8 = add i8 %add7, %mul7
; Write the accumulated i8 result back to %dst.
2855 store i8 %add8, i8 addrspace(1)* %dst, align 4
2859 ; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
2860 define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
2861 ; GFX7-LABEL: udot8_acc4_vecMul:
2862 ; GFX7: ; %bb.0: ; %entry
2863 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2864 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2865 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2866 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2867 ; GFX7-NEXT: s_mov_b32 s14, -1
2868 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2869 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2870 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2871 ; GFX7-NEXT: s_mov_b32 s10, 0
2872 ; GFX7-NEXT: s_mov_b32 s11, s3
2873 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2874 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2875 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2876 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2877 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2878 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2879 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2880 ; GFX7-NEXT: s_mov_b32 s2, -1
2881 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
2882 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2883 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2884 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
2885 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
2886 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
2887 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
2888 ; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
2889 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
2890 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
2891 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
2892 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2893 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
2894 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
2895 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
2896 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
2897 ; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
2898 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
2899 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
2900 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2901 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2902 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2903 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
2904 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
2905 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
2906 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
2907 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
2908 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2909 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2910 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2911 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2912 ; GFX7-NEXT: s_endpgm
2914 ; GFX8-LABEL: udot8_acc4_vecMul:
2915 ; GFX8: ; %bb.0: ; %entry
2916 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2917 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2918 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2919 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2920 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2921 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2922 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2923 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2924 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2925 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2926 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2927 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2928 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2929 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2930 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2931 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2932 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2933 ; GFX8-NEXT: s_mov_b32 s10, -1
2934 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2935 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2936 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2937 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2938 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2939 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
2940 ; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
2941 ; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
2942 ; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
2943 ; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
2944 ; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
2945 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
2946 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2947 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
2948 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
2949 ; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
2950 ; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
2951 ; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
2952 ; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
2953 ; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
2954 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
2955 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2956 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2957 ; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
2958 ; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
2959 ; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
2960 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
2961 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
2962 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
2963 ; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
2964 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
2965 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2966 ; GFX8-NEXT: s_endpgm
2968 ; GFX9-LABEL: udot8_acc4_vecMul:
2969 ; GFX9: ; %bb.0: ; %entry
2970 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2971 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2972 ; GFX9-NEXT: s_mov_b32 s10, -1
2973 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2974 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2975 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2976 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2977 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2978 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2979 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2980 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2981 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2982 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2983 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
2984 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2985 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
2986 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
2987 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2988 ; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
2989 ; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4
2990 ; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
2991 ; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4
2992 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7
2993 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12
2994 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5
2995 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4
2996 ; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
2997 ; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2998 ; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12
2999 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5
3000 ; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4
3001 ; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
3002 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9
3003 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
3004 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
3005 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1
3006 ; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
3007 ; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4
3008 ; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4
3009 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2
3010 ; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
3011 ; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9
3012 ; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14
3013 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3014 ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
3015 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
3016 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
3017 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16
3018 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
3019 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3020 ; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
3021 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1
3022 ; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16
3023 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
3024 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
3025 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
3026 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3027 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
3028 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3029 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
3030 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3031 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
3032 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
3033 ; GFX9-NEXT: s_endpgm
3035 ; GFX9-DL-LABEL: udot8_acc4_vecMul:
3036 ; GFX9-DL: ; %bb.0: ; %entry
3037 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3038 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3039 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
3040 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
3041 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
3042 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3043 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3044 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3045 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
3046 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3047 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
3048 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
3049 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
3050 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
3051 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
3052 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
3053 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
3054 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3055 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
3056 ; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
3057 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
3058 ; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4
3059 ; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7
3060 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
3061 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
3062 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4
3063 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
3064 ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
3065 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12
3066 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
3067 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4
3068 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
3069 ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9
3070 ; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14
3071 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
3072 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1
3073 ; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
3074 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4
3075 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4
3076 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2
3077 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
3078 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
3079 ; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14
3080 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3081 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
3082 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
3083 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
3084 ; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16
3085 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
3086 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3087 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2
3088 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1
3089 ; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16
3090 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
3091 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
3092 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
3093 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3094 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
3095 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3096 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
3097 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3098 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
3099 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
3100 ; GFX9-DL-NEXT: s_endpgm
3102 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
3103 ; GFX10-DL: ; %bb.0: ; %entry
3104 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3105 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3106 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3107 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3108 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3109 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
3110 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
3111 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
3112 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
3113 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3114 ; GFX10-DL-NEXT: s_clause 0x1
3115 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
3116 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
3117 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3118 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
3119 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
3120 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
3121 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3122 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
3123 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
3124 ; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
3125 ; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
3126 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6
3127 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
3128 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4
3129 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
3130 ; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8
3131 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
3132 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5
3133 ; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
3134 ; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
3135 ; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
3136 ; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
3137 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
3138 ; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4
3139 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
3140 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
3141 ; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11
3142 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
3143 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3144 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
3145 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4
3146 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
3147 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9
3148 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3149 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
3150 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
3151 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
3152 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
3153 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11
3154 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7
3155 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
3156 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3157 ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8
3158 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
3159 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
3160 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
3161 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
3162 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1
3163 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
3164 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
3165 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
3166 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
3167 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3168 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
3169 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
3170 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
3171 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
3172 ; GFX10-DL-NEXT: s_endpgm
3173 <8 x i4> addrspace(1)* %src2,
3174 i4 addrspace(1)* nocapture %dst) {
3176 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3177 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3178 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3179 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3180 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3182 %mul = mul <8 x i4> %vec1, %vec2
3183 %mul0 = extractelement <8 x i4> %mul, i64 0
3184 %mul1 = extractelement <8 x i4> %mul, i64 1
3185 %mul2 = extractelement <8 x i4> %mul, i64 2
3186 %mul3 = extractelement <8 x i4> %mul, i64 3
3187 %mul4 = extractelement <8 x i4> %mul, i64 4
3188 %mul5 = extractelement <8 x i4> %mul, i64 5
3189 %mul6 = extractelement <8 x i4> %mul, i64 6
3190 %mul7 = extractelement <8 x i4> %mul, i64 7
3192 %acc = load i4, i4 addrspace(1)* %dst, align 4
3193 %add1 = add i4 %mul0, %acc
3194 %add2 = add i4 %add1, %mul1
3195 %add3 = add i4 %add2, %mul2
3196 %add4 = add i4 %add3, %mul3
3197 %add5 = add i4 %add4, %mul4
3198 %add6 = add i4 %add5, %mul5
3199 %add7 = add i4 %add6, %mul6
3200 %add8 = add i4 %add7, %mul7
3202 store i4 %add8, i4 addrspace(1)* %dst, align 4
; udot8_variant1: unsigned 8-way dot product over the 4-bit fields of two i32
; values, accumulated into the i32 loaded from %dst. Each work item uses its
; x id to index %v1addr and %v2addr. The eight products are summed in a
; shuffled order: the product of bits 28-31 is added second rather than last.
; The assertions below show GFX9-DL and GFX10-DL selecting a single
; v_dot8_u32_u4, GFX7 and GFX8 a v_mad_u32_u24 chain, and plain GFX9
; v_mul_u32_u24 followed by v_add3_u32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand editing.
3206 define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
3207 ; GFX7-LABEL: udot8_variant1:
3208 ; GFX7: ; %bb.0: ; %entry
3209 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3210 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3211 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
3212 ; GFX7-NEXT: s_mov_b32 s10, 0
3213 ; GFX7-NEXT: s_mov_b32 s11, s3
3214 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3215 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
3216 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3217 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
3218 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
3219 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
3220 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
3221 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
3222 ; GFX7-NEXT: s_mov_b32 s2, -1
3223 ; GFX7-NEXT: s_waitcnt vmcnt(1)
3224 ; GFX7-NEXT: v_and_b32_e32 v1, 15, v2
3225 ; GFX7-NEXT: v_bfe_u32 v3, v2, 4, 4
3226 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3227 ; GFX7-NEXT: v_and_b32_e32 v9, 15, v0
3228 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
3229 ; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
3230 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
3231 ; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4
3232 ; GFX7-NEXT: v_bfe_u32 v8, v2, 24, 4
3233 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3234 ; GFX7-NEXT: v_bfe_u32 v10, v0, 4, 4
3235 ; GFX7-NEXT: v_bfe_u32 v11, v0, 8, 4
3236 ; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
3237 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
3238 ; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4
3239 ; GFX7-NEXT: v_bfe_u32 v15, v0, 24, 4
3240 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0
3241 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3242 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s4
3243 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
3244 ; GFX7-NEXT: v_mad_u32_u24 v0, v10, v3, v0
3245 ; GFX7-NEXT: v_mad_u32_u24 v0, v11, v4, v0
3246 ; GFX7-NEXT: v_mad_u32_u24 v0, v12, v5, v0
3247 ; GFX7-NEXT: v_mad_u32_u24 v0, v13, v6, v0
3248 ; GFX7-NEXT: v_mad_u32_u24 v0, v14, v7, v0
3249 ; GFX7-NEXT: v_mad_u32_u24 v0, v15, v8, v0
3250 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
3251 ; GFX7-NEXT: s_endpgm
3253 ; GFX8-LABEL: udot8_variant1:
3254 ; GFX8: ; %bb.0: ; %entry
3255 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3256 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3257 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3258 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3259 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
3260 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
3261 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3262 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3263 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
3264 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
3265 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3266 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
3267 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
3268 ; GFX8-NEXT: s_waitcnt vmcnt(1)
3269 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
3270 ; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
3271 ; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4
3272 ; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4
3273 ; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4
3274 ; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4
3275 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3276 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
3277 ; GFX8-NEXT: v_bfe_u32 v5, v0, 4, 4
3278 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
3279 ; GFX8-NEXT: v_bfe_u32 v9, v0, 12, 4
3280 ; GFX8-NEXT: v_bfe_u32 v11, v0, 16, 4
3281 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
3282 ; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4
3283 ; GFX8-NEXT: v_bfe_u32 v15, v0, 24, 4
3284 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
3285 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
3286 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3287 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2
3288 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
3289 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
3290 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
3291 ; GFX8-NEXT: v_mad_u32_u24 v0, v9, v8, v0
3292 ; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0
3293 ; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0
3294 ; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0
3295 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3296 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3297 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3298 ; GFX8-NEXT: s_endpgm
3300 ; GFX9-LABEL: udot8_variant1:
3301 ; GFX9: ; %bb.0: ; %entry
3302 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3303 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3304 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3306 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
3307 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
3308 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
3309 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3310 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3311 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1
3312 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3313 ; GFX9-NEXT: v_and_b32_e32 v4, 15, v2
3314 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
3315 ; GFX9-NEXT: v_bfe_u32 v6, v2, 4, 4
3316 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
3317 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
3318 ; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4
3319 ; GFX9-NEXT: v_bfe_u32 v10, v2, 12, 4
3320 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
3321 ; GFX9-NEXT: v_bfe_u32 v12, v2, 16, 4
3322 ; GFX9-NEXT: v_bfe_u32 v13, v1, 20, 4
3323 ; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4
3324 ; GFX9-NEXT: v_bfe_u32 v15, v1, 24, 4
3325 ; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
3326 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
3327 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3328 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v4, v3
3329 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v2, v1
3330 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5
3331 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7
3332 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3333 ; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1
3334 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9
3335 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11
3336 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5
3337 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v14, v13
3338 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15
3339 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7
3340 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9
3341 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
3342 ; GFX9-NEXT: s_endpgm
3344 ; GFX9-DL-LABEL: udot8_variant1:
3345 ; GFX9-DL: ; %bb.0: ; %entry
3346 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3347 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3348 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3349 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3350 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
3351 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
3352 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
3353 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
3354 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3355 ; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
3356 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
3357 ; GFX9-DL-NEXT: s_endpgm
3359 ; GFX10-DL-LABEL: udot8_variant1:
3360 ; GFX10-DL: ; %bb.0: ; %entry
3361 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3362 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3363 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3364 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3365 ; GFX10-DL-NEXT: s_clause 0x1
3366 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
3367 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
3368 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3369 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
3370 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3371 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2
3372 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
3373 ; GFX10-DL-NEXT: s_endpgm
3374 i32 addrspace(1)* %v2addr,
3375 i32 addrspace(1)* %dst) {
; Load this work item's element from each i32 source buffer.
3377 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3378 %gep1 = getelementptr i32, i32 addrspace(1)* %v1addr, i32 %idx
3379 %v1 = load i32, i32 addrspace(1)* %gep1, align 4
3380 %gep2 = getelementptr i32, i32 addrspace(1)* %v2addr, i32 %idx
3381 %v2 = load i32, i32 addrspace(1)* %gep2, align 4
; Product 1: bits 0-3 of each operand (mask only, no shift needed).
3382 %and = and i32 %v1, 15
3383 %and1 = and i32 %v2, 15
3384 %mul1 = mul nuw nsw i32 %and1, %and
; Product 2: bits 4-7.
3386 %shr = lshr i32 %v1, 4
3387 %and2 = and i32 %shr, 15
3388 %shr3 = lshr i32 %v2, 4
3389 %and4 = and i32 %shr3, 15
3390 %mul2 = mul nuw nsw i32 %and4, %and2
; Product 3: bits 8-11.
3392 %shr6 = lshr i32 %v1, 8
3393 %and7 = and i32 %shr6, 15
3394 %shr8 = lshr i32 %v2, 8
3395 %and9 = and i32 %shr8, 15
3396 %mul3 = mul nuw nsw i32 %and9, %and7
; Product 4: bits 12-15.
3398 %shr12 = lshr i32 %v1, 12
3399 %and13 = and i32 %shr12, 15
3400 %shr14 = lshr i32 %v2, 12
3401 %and15 = and i32 %shr14, 15
3402 %mul4 = mul nuw nsw i32 %and15, %and13
; Product 5: bits 16-19.
3404 %shr18 = lshr i32 %v1, 16
3405 %and19 = and i32 %shr18, 15
3406 %shr20 = lshr i32 %v2, 16
3407 %and21 = and i32 %shr20, 15
3408 %mul5 = mul nuw nsw i32 %and21, %and19
; Product 6: bits 20-23.
3410 %shr24 = lshr i32 %v1, 20
3411 %and25 = and i32 %shr24, 15
3412 %shr26 = lshr i32 %v2, 20
3413 %and27 = and i32 %shr26, 15
3414 %mul6 = mul nuw nsw i32 %and27, %and25
; Product 7: bits 24-27.
3416 %shr30 = lshr i32 %v1, 24
3417 %and31 = and i32 %shr30, 15
3418 %shr32 = lshr i32 %v2, 24
3419 %and33 = and i32 %shr32, 15
3420 %mul7 = mul nuw nsw i32 %and33, %and31
; Product 8: bits 28-31. The logical shift by 28 already leaves only four
; bits, so no mask is applied.
3422 %shr36 = lshr i32 %v1, 28
3423 %shr37 = lshr i32 %v2, 28
3424 %mul8 = mul nuw nsw i32 %shr37, %shr36
; Load the running accumulator from %dst.
3425 %acc = load i32, i32 addrspace(1)* %dst, align 4
; Sum accumulator and products. Note the order: %mul8 is added immediately
; after %mul1, ahead of %mul2 through %mul7.
3427 %add1 = add i32 %mul1, %acc
3428 %add2 = add i32 %add1, %mul8
3429 %add3 = add i32 %add2, %mul2
3430 %add4 = add i32 %add3, %mul3
3431 %add5 = add i32 %add4, %mul4
3432 %add6 = add i32 %add5, %mul5
3433 %add7 = add i32 %add6, %mul6
3434 %add8 = add i32 %add7, %mul7
; Write the final sum back to %dst.
3435 store i32 %add8, i32 addrspace(1)* %dst, align 4
; AMDGPU intrinsic returning the calling work item's id in the x dimension;
; the kernels in this file call it to compute the %idx used to index their
; source buffers.
3439 declare i32 @llvm.amdgcn.workitem.id.x()