1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; udot8_acc32: unsigned 8-way dot product of two <8 x i4> vectors with an
; i32 accumulator. Each work-item loads one vector from %src1 and one from
; %src2, zero-extends all eight lanes of both to i32, multiplies them lane by
; lane, and adds the eight products to the i32 read from %dst; the sum is
; stored back to %dst.
; In the assertions below the gfx906, gfx1011 and gfx1012 runs collapse the
; whole chain into a single v_dot8_u32_u4, while the gfx700, gfx803 and gfx900
; runs extract every nibble (v_lshrrev_b32 / v_bfe_u32 / v_and_b32) and combine
; them with v_mad_u32_u24, or with v_mul_u32_u24 plus v_add3_u32 on gfx900.
9 define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
10 ; GFX7-LABEL: udot8_acc32:
11 ; GFX7: ; %bb.0: ; %entry
12 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
13 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
15 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
16 ; GFX7-NEXT: s_mov_b32 s14, -1
17 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
18 ; GFX7-NEXT: s_add_u32 s12, s12, s3
19 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
20 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
22 ; GFX7-NEXT: s_mov_b32 s10, 0
23 ; GFX7-NEXT: s_mov_b32 s11, s3
24 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
25 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
26 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
27 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
28 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
29 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
30 ; GFX7-NEXT: s_mov_b32 s2, -1
31 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
32 ; GFX7-NEXT: s_waitcnt vmcnt(1)
33 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
34 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
35 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
36 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
37 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
38 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
39 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
40 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
41 ; GFX7-NEXT: s_waitcnt vmcnt(0)
42 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
43 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
44 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
45 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
46 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
47 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
48 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
49 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
51 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
52 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
53 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
54 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
55 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
56 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
57 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
58 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
59 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
62 ; GFX8-LABEL: udot8_acc32:
63 ; GFX8: ; %bb.0: ; %entry
64 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
65 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
66 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
67 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
68 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
71 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
72 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
73 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
74 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
75 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
76 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
77 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
78 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
79 ; GFX8-NEXT: s_mov_b32 s10, -1
80 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
81 ; GFX8-NEXT: s_add_u32 s8, s8, s3
82 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
83 ; GFX8-NEXT: s_waitcnt vmcnt(1)
84 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
85 ; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4
86 ; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4
87 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4
88 ; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4
89 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
90 ; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4
91 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
92 ; GFX8-NEXT: s_waitcnt vmcnt(0)
93 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1
94 ; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4
95 ; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4
96 ; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4
97 ; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4
98 ; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4
99 ; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4
100 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
101 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s2
103 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
104 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
105 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
106 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
107 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
108 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v10, v0
109 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v9, v0
110 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
111 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
112 ; GFX8-NEXT: flat_store_dword v[0:1], v2
113 ; GFX8-NEXT: s_endpgm
115 ; GFX9-LABEL: udot8_acc32:
116 ; GFX9: ; %bb.0: ; %entry
117 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
118 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
119 ; GFX9-NEXT: s_mov_b32 s10, -1
120 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
121 ; GFX9-NEXT: s_add_u32 s8, s8, s3
122 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
123 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
124 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
125 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
126 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
128 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
129 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
130 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
131 ; GFX9-NEXT: s_waitcnt vmcnt(1)
132 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
134 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
135 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
136 ; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
137 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
138 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
139 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
140 ; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
141 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
142 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
143 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
144 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
145 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
146 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
147 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
148 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
149 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
150 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
151 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
152 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
153 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
155 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
156 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
157 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
158 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
159 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
160 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
161 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
162 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
163 ; GFX9-NEXT: s_endpgm
165 ; GFX9-DL-LABEL: udot8_acc32:
166 ; GFX9-DL: ; %bb.0: ; %entry
167 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
168 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
169 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
170 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
171 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
172 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
173 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
174 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
175 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
176 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
177 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
179 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
180 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
181 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
182 ; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0
183 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
184 ; GFX9-DL-NEXT: s_endpgm
186 ; GFX10-DL-LABEL: udot8_acc32:
187 ; GFX10-DL: ; %bb.0: ; %entry
188 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
189 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
190 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
191 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
192 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
193 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
194 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
195 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
196 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
197 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX10-DL-NEXT: s_clause 0x1
199 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
200 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
201 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
202 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
203 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
204 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
205 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
206 ; GFX10-DL-NEXT: s_endpgm
207 <8 x i4> addrspace(1)* %src2,
208 i32 addrspace(1)* nocapture %dst) {
; The work-item id indexes both source arrays, so each work-item reads its
; own pair of <8 x i4> vectors.
210 %idx = call i32 @llvm.amdgcn.workitem.id.x()
211 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
212 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
213 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
214 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Lanes 0..7: extract the matching i4 from each vector, zero-extend both to
; i32 (unsigned interpretation) and multiply.
216 %v1e0 = extractelement <8 x i4> %vec1, i64 0
217 %cv1e0 = zext i4 %v1e0 to i32
218 %v2e0 = extractelement <8 x i4> %vec2, i64 0
219 %cv2e0 = zext i4 %v2e0 to i32
220 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
222 %v1e1 = extractelement <8 x i4> %vec1, i64 1
223 %cv1e1 = zext i4 %v1e1 to i32
224 %v2e1 = extractelement <8 x i4> %vec2, i64 1
225 %cv2e1 = zext i4 %v2e1 to i32
226 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
228 %v1e2 = extractelement <8 x i4> %vec1, i64 2
229 %cv1e2 = zext i4 %v1e2 to i32
230 %v2e2 = extractelement <8 x i4> %vec2, i64 2
231 %cv2e2 = zext i4 %v2e2 to i32
232 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
234 %v1e3 = extractelement <8 x i4> %vec1, i64 3
235 %cv1e3 = zext i4 %v1e3 to i32
236 %v2e3 = extractelement <8 x i4> %vec2, i64 3
237 %cv2e3 = zext i4 %v2e3 to i32
238 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
240 %v1e4 = extractelement <8 x i4> %vec1, i64 4
241 %cv1e4 = zext i4 %v1e4 to i32
242 %v2e4 = extractelement <8 x i4> %vec2, i64 4
243 %cv2e4 = zext i4 %v2e4 to i32
244 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
246 %v1e5 = extractelement <8 x i4> %vec1, i64 5
247 %cv1e5 = zext i4 %v1e5 to i32
248 %v2e5 = extractelement <8 x i4> %vec2, i64 5
249 %cv2e5 = zext i4 %v2e5 to i32
250 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
252 %v1e6 = extractelement <8 x i4> %vec1, i64 6
253 %cv1e6 = zext i4 %v1e6 to i32
254 %v2e6 = extractelement <8 x i4> %vec2, i64 6
255 %cv2e6 = zext i4 %v2e6 to i32
256 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
258 %v1e7 = extractelement <8 x i4> %vec1, i64 7
259 %cv1e7 = zext i4 %v1e7 to i32
260 %v2e7 = extractelement <8 x i4> %vec2, i64 7
261 %cv2e7 = zext i4 %v2e7 to i32
262 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Chain the eight products onto the accumulator loaded from %dst, lane 0
; first, then write the total back to the same address.
264 %acc = load i32, i32 addrspace(1)* %dst, align 4
265 %add1 = add i32 %mul0, %acc
266 %add2 = add i32 %add1, %mul1
267 %add3 = add i32 %add2, %mul2
268 %add4 = add i32 %add3, %mul3
269 %add5 = add i32 %add4, %mul4
270 %add6 = add i32 %add5, %mul5
271 %add7 = add i32 %add6, %mul6
272 %add8 = add i32 %add7, %mul7
274 store i32 %add8, i32 addrspace(1)* %dst, align 4
278 ; TODO: Remove the unnecessary instruction (the one zero-extending the
279 ; 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16: same unsigned 8-way <8 x i4> dot product as the i32 variant,
; but every lane is zero-extended to i16, multiplied in i16, and accumulated
; onto an i16 loaded from (and stored back to) %dst.
; In the assertions below none of the runs selects v_dot8_u32_u4: each one
; extracts the nibbles and chains eight MADs (v_mad_u32_u24 on gfx700,
; v_mad_u16 on gfx803 and gfx1011/gfx1012, v_mad_legacy_u16 on gfx900/gfx906).
280 define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
281 ; GFX7-LABEL: udot8_acc16:
282 ; GFX7: ; %bb.0: ; %entry
283 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
284 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
285 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
286 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
287 ; GFX7-NEXT: s_mov_b32 s14, -1
288 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
289 ; GFX7-NEXT: s_add_u32 s12, s12, s3
290 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
291 ; GFX7-NEXT: s_mov_b32 s10, 0
292 ; GFX7-NEXT: s_mov_b32 s11, s3
293 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
295 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
296 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
297 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
298 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
299 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
300 ; GFX7-NEXT: s_mov_b32 s2, -1
301 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0
302 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
303 ; GFX7-NEXT: s_waitcnt vmcnt(2)
304 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
305 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
306 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
307 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
308 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
309 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
310 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
311 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
312 ; GFX7-NEXT: s_waitcnt vmcnt(1)
313 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
314 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
315 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
316 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
317 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
318 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
319 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
320 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
321 ; GFX7-NEXT: s_waitcnt vmcnt(0)
322 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
323 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
324 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
325 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
326 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
327 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
328 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
329 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
330 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
331 ; GFX7-NEXT: s_endpgm
333 ; GFX8-LABEL: udot8_acc16:
334 ; GFX8: ; %bb.0: ; %entry
335 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
336 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
337 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
338 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
339 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
340 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
342 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
343 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
344 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
345 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
346 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
347 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
348 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
349 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
350 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
351 ; GFX8-NEXT: flat_load_ushort v18, v[2:3]
352 ; GFX8-NEXT: s_mov_b32 s10, -1
353 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
354 ; GFX8-NEXT: s_add_u32 s8, s8, s3
355 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
356 ; GFX8-NEXT: s_waitcnt vmcnt(2)
357 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4
358 ; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4
359 ; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4
360 ; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4
361 ; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4
362 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4
363 ; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4
364 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
365 ; GFX8-NEXT: s_waitcnt vmcnt(1)
366 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0
367 ; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4
368 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
369 ; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4
370 ; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4
371 ; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4
372 ; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4
373 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
374 ; GFX8-NEXT: s_waitcnt vmcnt(0)
375 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18
376 ; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0
377 ; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0
378 ; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0
379 ; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0
380 ; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0
381 ; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0
382 ; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0
383 ; GFX8-NEXT: flat_store_short v[2:3], v0
384 ; GFX8-NEXT: s_endpgm
386 ; GFX9-LABEL: udot8_acc16:
387 ; GFX9: ; %bb.0: ; %entry
388 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
389 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
390 ; GFX9-NEXT: s_mov_b32 s10, -1
391 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
392 ; GFX9-NEXT: s_add_u32 s8, s8, s3
393 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
394 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
395 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
396 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
397 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
398 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
400 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
401 ; GFX9-NEXT: global_load_ushort v17, v1, s[2:3]
402 ; GFX9-NEXT: s_waitcnt vmcnt(2)
403 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2
404 ; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4
405 ; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4
406 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4
407 ; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4
408 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
409 ; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4
410 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
411 ; GFX9-NEXT: s_waitcnt vmcnt(1)
412 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3
413 ; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4
414 ; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4
415 ; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4
416 ; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4
417 ; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4
418 ; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4
419 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
421 ; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
422 ; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
423 ; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
424 ; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
425 ; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
426 ; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
427 ; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
428 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
429 ; GFX9-NEXT: global_store_short v1, v0, s[2:3]
430 ; GFX9-NEXT: s_endpgm
432 ; GFX9-DL-LABEL: udot8_acc16:
433 ; GFX9-DL: ; %bb.0: ; %entry
434 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
435 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
436 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
437 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
438 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
439 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
440 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
441 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
443 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
444 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
445 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
446 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
447 ; GFX9-DL-NEXT: global_load_ushort v17, v1, s[2:3]
448 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
449 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2
450 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
451 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
452 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
453 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
454 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
455 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
456 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
457 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
458 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3
459 ; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4
460 ; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4
461 ; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4
462 ; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4
463 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4
464 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4
465 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
466 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
467 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
468 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
469 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
470 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
471 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
472 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
473 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
474 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
475 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3]
476 ; GFX9-DL-NEXT: s_endpgm
478 ; GFX10-DL-LABEL: udot8_acc16:
479 ; GFX10-DL: ; %bb.0: ; %entry
480 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
481 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
482 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
483 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
484 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
485 ; GFX10-DL-NEXT: s_clause 0x1
486 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
487 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
488 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
489 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
490 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
491 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX10-DL-NEXT: s_clause 0x1
493 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
494 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
495 ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3]
496 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
497 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
498 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
499 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
500 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
501 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
502 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
503 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
504 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
505 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
506 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
507 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
508 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
509 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
510 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
511 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
512 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
513 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
514 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
515 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
516 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
517 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
518 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
519 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
520 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
521 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
522 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
523 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3]
524 ; GFX10-DL-NEXT: s_endpgm
525 <8 x i4> addrspace(1)* %src2,
526 i16 addrspace(1)* nocapture %dst) {
; The work-item id indexes both source arrays, so each work-item reads its
; own pair of <8 x i4> vectors.
528 %idx = call i32 @llvm.amdgcn.workitem.id.x()
529 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
530 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
531 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
532 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Lanes 0..7: extract the matching i4 from each vector, zero-extend both to
; i16 (unsigned interpretation) and multiply in i16.
534 %v1e0 = extractelement <8 x i4> %vec1, i64 0
535 %cv1e0 = zext i4 %v1e0 to i16
536 %v2e0 = extractelement <8 x i4> %vec2, i64 0
537 %cv2e0 = zext i4 %v2e0 to i16
538 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
540 %v1e1 = extractelement <8 x i4> %vec1, i64 1
541 %cv1e1 = zext i4 %v1e1 to i16
542 %v2e1 = extractelement <8 x i4> %vec2, i64 1
543 %cv2e1 = zext i4 %v2e1 to i16
544 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
546 %v1e2 = extractelement <8 x i4> %vec1, i64 2
547 %cv1e2 = zext i4 %v1e2 to i16
548 %v2e2 = extractelement <8 x i4> %vec2, i64 2
549 %cv2e2 = zext i4 %v2e2 to i16
550 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
552 %v1e3 = extractelement <8 x i4> %vec1, i64 3
553 %cv1e3 = zext i4 %v1e3 to i16
554 %v2e3 = extractelement <8 x i4> %vec2, i64 3
555 %cv2e3 = zext i4 %v2e3 to i16
556 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
558 %v1e4 = extractelement <8 x i4> %vec1, i64 4
559 %cv1e4 = zext i4 %v1e4 to i16
560 %v2e4 = extractelement <8 x i4> %vec2, i64 4
561 %cv2e4 = zext i4 %v2e4 to i16
562 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
564 %v1e5 = extractelement <8 x i4> %vec1, i64 5
565 %cv1e5 = zext i4 %v1e5 to i16
566 %v2e5 = extractelement <8 x i4> %vec2, i64 5
567 %cv2e5 = zext i4 %v2e5 to i16
568 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
570 %v1e6 = extractelement <8 x i4> %vec1, i64 6
571 %cv1e6 = zext i4 %v1e6 to i16
572 %v2e6 = extractelement <8 x i4> %vec2, i64 6
573 %cv2e6 = zext i4 %v2e6 to i16
574 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
576 %v1e7 = extractelement <8 x i4> %vec1, i64 7
577 %cv1e7 = zext i4 %v1e7 to i16
578 %v2e7 = extractelement <8 x i4> %vec2, i64 7
579 %cv2e7 = zext i4 %v2e7 to i16
580 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
; Chain the eight products onto the i16 accumulator loaded from %dst, lane 0
; first, then write the total back to the same address.
582 %acc = load i16, i16 addrspace(1)* %dst, align 4
583 %add1 = add i16 %mul0, %acc
584 %add2 = add i16 %add1, %mul1
585 %add3 = add i16 %add2, %mul2
586 %add4 = add i16 %add3, %mul3
587 %add5 = add i16 %add4, %mul4
588 %add6 = add i16 %add5, %mul5
589 %add7 = add i16 %add6, %mul6
590 %add8 = add i16 %add7, %mul7
592 store i16 %add8, i16 addrspace(1)* %dst, align 4
596 ; TODO: Remove the unnecessary instruction (the one zero-extending the
597 ; 2nd MAD) so that the pattern recognizer can kick in.
598 define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
599 ; GFX7-LABEL: udot8_acc8:
600 ; GFX7: ; %bb.0: ; %entry
601 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
602 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
603 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
604 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
605 ; GFX7-NEXT: s_mov_b32 s14, -1
606 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
607 ; GFX7-NEXT: s_add_u32 s12, s12, s3
608 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
609 ; GFX7-NEXT: s_mov_b32 s10, 0
610 ; GFX7-NEXT: s_mov_b32 s11, s3
611 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
612 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
613 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
614 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
615 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
616 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
617 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
618 ; GFX7-NEXT: s_mov_b32 s2, -1
619 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
620 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
621 ; GFX7-NEXT: s_waitcnt vmcnt(2)
622 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
623 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
624 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
625 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
626 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
627 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
628 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
629 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
630 ; GFX7-NEXT: s_waitcnt vmcnt(1)
631 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
632 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
633 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
634 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
635 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
636 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
637 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
638 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
639 ; GFX7-NEXT: s_waitcnt vmcnt(0)
640 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
641 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
642 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
643 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
644 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
645 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
646 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
647 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
648 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
649 ; GFX7-NEXT: s_endpgm
651 ; GFX8-LABEL: udot8_acc8:
652 ; GFX8: ; %bb.0: ; %entry
653 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
654 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
655 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
656 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
657 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
658 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
659 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
660 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
661 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
662 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
663 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
664 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
665 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
666 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
667 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
668 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
669 ; GFX8-NEXT: flat_load_ubyte v18, v[2:3]
670 ; GFX8-NEXT: s_mov_b32 s10, -1
671 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
672 ; GFX8-NEXT: s_add_u32 s8, s8, s3
673 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
674 ; GFX8-NEXT: s_waitcnt vmcnt(2)
675 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4
676 ; GFX8-NEXT: v_bfe_u32 v5, v4, 24, 4
677 ; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4
678 ; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4
679 ; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4
680 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4
681 ; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4
682 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
683 ; GFX8-NEXT: s_waitcnt vmcnt(1)
684 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0
685 ; GFX8-NEXT: v_bfe_u32 v12, v0, 24, 4
686 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
687 ; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4
688 ; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4
689 ; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4
690 ; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4
691 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
692 ; GFX8-NEXT: s_waitcnt vmcnt(0)
693 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18
694 ; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0
695 ; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0
696 ; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0
697 ; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0
698 ; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0
699 ; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0
700 ; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0
701 ; GFX8-NEXT: flat_store_byte v[2:3], v0
702 ; GFX8-NEXT: s_endpgm
704 ; GFX9-LABEL: udot8_acc8:
705 ; GFX9: ; %bb.0: ; %entry
706 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
707 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
708 ; GFX9-NEXT: s_mov_b32 s10, -1
709 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
710 ; GFX9-NEXT: s_add_u32 s8, s8, s3
711 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
712 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
713 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
714 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
715 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
716 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
718 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
719 ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3]
720 ; GFX9-NEXT: s_waitcnt vmcnt(2)
721 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2
722 ; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4
723 ; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4
724 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4
725 ; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4
726 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
727 ; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4
728 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
729 ; GFX9-NEXT: s_waitcnt vmcnt(1)
730 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3
731 ; GFX9-NEXT: v_bfe_u32 v11, v3, 24, 4
732 ; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4
733 ; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4
734 ; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4
735 ; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4
736 ; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4
737 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
740 ; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
741 ; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
742 ; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
743 ; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
744 ; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
745 ; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
746 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
747 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
748 ; GFX9-NEXT: s_endpgm
750 ; GFX9-DL-LABEL: udot8_acc8:
751 ; GFX9-DL: ; %bb.0: ; %entry
752 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
753 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
754 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
755 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
756 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
757 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
758 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
759 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
760 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
761 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
762 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
764 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
765 ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3]
766 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
767 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2
768 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
769 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
770 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
771 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
772 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
773 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
774 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
775 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
776 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3
777 ; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 24, 4
778 ; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4
779 ; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4
780 ; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4
781 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4
782 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4
783 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
784 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
785 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
786 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
787 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
788 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
789 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
790 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
791 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
792 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
793 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
794 ; GFX9-DL-NEXT: s_endpgm
796 ; GFX10-DL-LABEL: udot8_acc8:
797 ; GFX10-DL: ; %bb.0: ; %entry
798 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
799 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
800 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
801 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
802 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
803 ; GFX10-DL-NEXT: s_clause 0x1
804 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
805 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
806 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
807 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
808 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
809 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
810 ; GFX10-DL-NEXT: s_clause 0x1
811 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
812 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
813 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
814 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
815 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
816 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
817 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
818 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
819 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
820 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
821 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
822 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
823 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
824 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
825 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
826 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
827 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
828 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
829 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
830 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
831 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
832 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
833 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
834 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
835 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
836 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
837 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
838 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
839 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
840 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
841 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
842 ; GFX10-DL-NEXT: s_endpgm
843 <8 x i4> addrspace(1)* %src2,
844 i8 addrspace(1)* nocapture %dst) {
846 %idx = call i32 @llvm.amdgcn.workitem.id.x()
847 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
848 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
849 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
850 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
852 %v1e0 = extractelement <8 x i4> %vec1, i64 0
853 %cv1e0 = zext i4 %v1e0 to i8
854 %v2e0 = extractelement <8 x i4> %vec2, i64 0
855 %cv2e0 = zext i4 %v2e0 to i8
856 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
858 %v1e1 = extractelement <8 x i4> %vec1, i64 1
859 %cv1e1 = zext i4 %v1e1 to i8
860 %v2e1 = extractelement <8 x i4> %vec2, i64 1
861 %cv2e1 = zext i4 %v2e1 to i8
862 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
864 %v1e2 = extractelement <8 x i4> %vec1, i64 2
865 %cv1e2 = zext i4 %v1e2 to i8
866 %v2e2 = extractelement <8 x i4> %vec2, i64 2
867 %cv2e2 = zext i4 %v2e2 to i8
868 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
870 %v1e3 = extractelement <8 x i4> %vec1, i64 3
871 %cv1e3 = zext i4 %v1e3 to i8
872 %v2e3 = extractelement <8 x i4> %vec2, i64 3
873 %cv2e3 = zext i4 %v2e3 to i8
874 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
876 %v1e4 = extractelement <8 x i4> %vec1, i64 4
877 %cv1e4 = zext i4 %v1e4 to i8
878 %v2e4 = extractelement <8 x i4> %vec2, i64 4
879 %cv2e4 = zext i4 %v2e4 to i8
880 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
882 %v1e5 = extractelement <8 x i4> %vec1, i64 5
883 %cv1e5 = zext i4 %v1e5 to i8
884 %v2e5 = extractelement <8 x i4> %vec2, i64 5
885 %cv2e5 = zext i4 %v2e5 to i8
886 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
888 %v1e6 = extractelement <8 x i4> %vec1, i64 6
889 %cv1e6 = zext i4 %v1e6 to i8
890 %v2e6 = extractelement <8 x i4> %vec2, i64 6
891 %cv2e6 = zext i4 %v2e6 to i8
892 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
894 %v1e7 = extractelement <8 x i4> %vec1, i64 7
895 %cv1e7 = zext i4 %v1e7 to i8
896 %v2e7 = extractelement <8 x i4> %vec2, i64 7
897 %cv2e7 = zext i4 %v2e7 to i8
898 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
900 %acc = load i8, i8 addrspace(1)* %dst, align 4
901 %add1 = add i8 %mul0, %acc
902 %add2 = add i8 %add1, %mul1
903 %add3 = add i8 %add2, %mul2
904 %add4 = add i8 %add3, %mul3
905 %add5 = add i8 %add4, %mul4
906 %add6 = add i8 %add5, %mul5
907 %add7 = add i8 %add6, %mul6
908 %add8 = add i8 %add7, %mul7
910 store i8 %add8, i8 addrspace(1)* %dst, align 4
914 ; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD)
915 ; so that the pattern recognizer can kick in.
; udot8_acc4: unsigned dot product of two <8 x i4> vectors, accumulated into the
; i4 value loaded from %dst. The multiplies and adds are all done directly in i4
; (no zext). For every prefix below, the expected code is a chain of eight MAD
; instructions followed by an "and 15" before the byte store; no dot instruction
; is expected.
916 define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
917 ; GFX7-LABEL: udot8_acc4:
918 ; GFX7: ; %bb.0: ; %entry
919 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
920 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
921 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
922 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
923 ; GFX7-NEXT: s_mov_b32 s14, -1
924 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
925 ; GFX7-NEXT: s_add_u32 s12, s12, s3
926 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
927 ; GFX7-NEXT: s_mov_b32 s10, 0
928 ; GFX7-NEXT: s_mov_b32 s11, s3
929 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
931 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
932 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
933 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
934 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
935 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
936 ; GFX7-NEXT: s_mov_b32 s2, -1
937 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
938 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
939 ; GFX7-NEXT: s_waitcnt vmcnt(2)
940 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
941 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
942 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
943 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
944 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
945 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
946 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
947 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
948 ; GFX7-NEXT: s_waitcnt vmcnt(1)
949 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
950 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
951 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
952 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
953 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
954 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
955 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
956 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
957 ; GFX7-NEXT: s_waitcnt vmcnt(0)
958 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
959 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
960 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
961 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
962 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
963 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
964 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
965 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
966 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
967 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
968 ; GFX7-NEXT: s_endpgm
970 ; GFX8-LABEL: udot8_acc4:
971 ; GFX8: ; %bb.0: ; %entry
972 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
973 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
974 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
975 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
976 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
977 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
978 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
979 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
980 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
981 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
982 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
983 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
984 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
985 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
986 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
987 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
988 ; GFX8-NEXT: flat_load_ubyte v18, v[2:3]
989 ; GFX8-NEXT: s_mov_b32 s10, -1
990 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
991 ; GFX8-NEXT: s_add_u32 s8, s8, s3
992 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
993 ; GFX8-NEXT: s_waitcnt vmcnt(2)
994 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4
995 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4
996 ; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4
997 ; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4
998 ; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4
999 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4
1000 ; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4
1001 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
1002 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1003 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0
1004 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0
1005 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
1006 ; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4
1007 ; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4
1008 ; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4
1009 ; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4
1010 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1011 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1012 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18
1013 ; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0
1014 ; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0
1015 ; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0
1016 ; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0
1017 ; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0
1018 ; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0
1019 ; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0
1020 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1021 ; GFX8-NEXT: flat_store_byte v[2:3], v0
1022 ; GFX8-NEXT: s_endpgm
1024 ; GFX9-LABEL: udot8_acc4:
1025 ; GFX9: ; %bb.0: ; %entry
1026 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1027 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1028 ; GFX9-NEXT: s_mov_b32 s10, -1
1029 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1030 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1031 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1032 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1033 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1034 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1035 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1036 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1037 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1038 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
1039 ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3]
1040 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1041 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2
1042 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1043 ; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4
1044 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4
1045 ; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4
1046 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
1047 ; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4
1048 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1049 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1050 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3
1051 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3
1052 ; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4
1053 ; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4
1054 ; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4
1055 ; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4
1056 ; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4
1057 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
1058 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1059 ; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
1060 ; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
1061 ; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
1062 ; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
1063 ; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
1064 ; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
1065 ; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
1066 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
1067 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
1068 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
1069 ; GFX9-NEXT: s_endpgm
1071 ; GFX9-DL-LABEL: udot8_acc4:
1072 ; GFX9-DL: ; %bb.0: ; %entry
1073 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1074 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1075 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1076 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1077 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1078 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1079 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1080 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1081 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
1082 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1083 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1084 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
1085 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
1086 ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3]
1087 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1088 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2
1089 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1090 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
1091 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
1092 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
1093 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
1094 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
1095 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1096 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1097 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3
1098 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3
1099 ; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4
1100 ; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4
1101 ; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4
1102 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4
1103 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4
1104 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
1105 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1106 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
1107 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
1108 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
1109 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
1110 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
1111 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
1112 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
1113 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
1114 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0
1115 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
1116 ; GFX9-DL-NEXT: s_endpgm
1118 ; GFX10-DL-LABEL: udot8_acc4:
1119 ; GFX10-DL: ; %bb.0: ; %entry
1120 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1121 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1122 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1123 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1124 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1125 ; GFX10-DL-NEXT: s_clause 0x1
1126 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1127 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1128 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1129 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1130 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1131 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1132 ; GFX10-DL-NEXT: s_clause 0x1
1133 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1134 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1135 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1136 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1137 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
1138 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1139 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
1140 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
1141 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
1142 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1143 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
1144 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
1145 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
1146 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1147 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
1148 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
1149 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1150 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
1151 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
1152 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1153 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
1154 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
1155 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1156 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1157 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
1158 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
1159 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
1160 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1161 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1162 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
1163 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
1164 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1165 ; GFX10-DL-NEXT: s_endpgm
1166 <8 x i4> addrspace(1)* %src2,
1167 i4 addrspace(1)* nocapture %dst) {
; Each work-item indexes both sources by its workitem.id.x and loads one
; <8 x i4> vector from each.
1169 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1170 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1171 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1172 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1173 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Element-wise i4 products of the two vectors, elements 0 through 7.
1175 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1176 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1177 %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1179 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1180 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1181 %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1183 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1184 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1185 %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1187 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1188 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1189 %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1191 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1192 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1193 %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1195 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1196 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1197 %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1199 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1200 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1201 %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1203 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1204 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1205 %mul7 = mul nuw nsw i4 %v1e7, %v2e7
; Add the products, in element order, onto the i4 accumulator loaded from %dst,
; then store the i4 result back to %dst. Each add takes the running sum first
; and the new product second (except %add1, which is product + accumulator).
1207 %acc = load i4, i4 addrspace(1)* %dst, align 4
1208 %add1 = add i4 %mul0, %acc
1209 %add2 = add i4 %add1, %mul1
1210 %add3 = add i4 %add2, %mul2
1211 %add4 = add i4 %add3, %mul3
1212 %add5 = add i4 %add4, %mul4
1213 %add6 = add i4 %add5, %mul5
1214 %add7 = add i4 %add6, %mul6
1215 %add8 = add i4 %add7, %mul7
1217 store i4 %add8, i4 addrspace(1)* %dst, align 4
1221 ; TODO: Currently, permutation of udot8 is turned off because it causes a huge
1222 ; increase in compile time.
; udot8_CommutationInsideMAD: the same i4 dot-product-with-accumulator as
; udot8_acc4, except that from %add2 onward each add takes the new product as
; its first operand and the running sum as its second (operands commuted).
; For every prefix below, the expected code is again a chain of eight MAD
; instructions followed by an "and 15" before the byte store; no dot instruction
; is expected.
1223 define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
1224 ; GFX7-LABEL: udot8_CommutationInsideMAD:
1225 ; GFX7: ; %bb.0: ; %entry
1226 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1227 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1228 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1229 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1230 ; GFX7-NEXT: s_mov_b32 s14, -1
1231 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1232 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1233 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1234 ; GFX7-NEXT: s_mov_b32 s10, 0
1235 ; GFX7-NEXT: s_mov_b32 s11, s3
1236 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1238 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1239 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1240 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1241 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1242 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1243 ; GFX7-NEXT: s_mov_b32 s2, -1
1244 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
1245 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1246 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1247 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
1248 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
1249 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
1250 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
1251 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
1252 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
1253 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
1254 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1255 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1256 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1257 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
1258 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
1259 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
1260 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
1261 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
1262 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
1263 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1264 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1265 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
1266 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1267 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1268 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1269 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1270 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1271 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
1272 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
1273 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1274 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
1275 ; GFX7-NEXT: s_endpgm
1277 ; GFX8-LABEL: udot8_CommutationInsideMAD:
1278 ; GFX8: ; %bb.0: ; %entry
1279 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1280 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1281 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1282 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1283 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1284 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1285 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1286 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1287 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
1289 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1290 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1291 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1292 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1293 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1294 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1295 ; GFX8-NEXT: flat_load_ubyte v18, v[2:3]
1296 ; GFX8-NEXT: s_mov_b32 s10, -1
1297 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1298 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1299 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1300 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1301 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v4
1302 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4
1303 ; GFX8-NEXT: v_bfe_u32 v6, v4, 20, 4
1304 ; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 4
1305 ; GFX8-NEXT: v_bfe_u32 v8, v4, 12, 4
1306 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4
1307 ; GFX8-NEXT: v_bfe_u32 v10, v4, 4, 4
1308 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
1309 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1310 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v0
1311 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v0
1312 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
1313 ; GFX8-NEXT: v_bfe_u32 v14, v0, 16, 4
1314 ; GFX8-NEXT: v_bfe_u32 v15, v0, 12, 4
1315 ; GFX8-NEXT: v_bfe_u32 v16, v0, 8, 4
1316 ; GFX8-NEXT: v_bfe_u32 v17, v0, 4, 4
1317 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1318 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1319 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v18
1320 ; GFX8-NEXT: v_mad_u16 v0, v10, v17, v0
1321 ; GFX8-NEXT: v_mad_u16 v0, v9, v16, v0
1322 ; GFX8-NEXT: v_mad_u16 v0, v8, v15, v0
1323 ; GFX8-NEXT: v_mad_u16 v0, v7, v14, v0
1324 ; GFX8-NEXT: v_mad_u16 v0, v6, v13, v0
1325 ; GFX8-NEXT: v_mad_u16 v0, v5, v12, v0
1326 ; GFX8-NEXT: v_mad_u16 v0, v1, v11, v0
1327 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1328 ; GFX8-NEXT: flat_store_byte v[2:3], v0
1329 ; GFX8-NEXT: s_endpgm
1331 ; GFX9-LABEL: udot8_CommutationInsideMAD:
1332 ; GFX9: ; %bb.0: ; %entry
1333 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1334 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1335 ; GFX9-NEXT: s_mov_b32 s10, -1
1336 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1337 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1338 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1339 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1340 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1341 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1342 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1343 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1344 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1345 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
1346 ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3]
1347 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1348 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2
1349 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1350 ; GFX9-NEXT: v_bfe_u32 v5, v2, 20, 4
1351 ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 4
1352 ; GFX9-NEXT: v_bfe_u32 v7, v2, 12, 4
1353 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
1354 ; GFX9-NEXT: v_bfe_u32 v9, v2, 4, 4
1355 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1356 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1357 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v3
1358 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3
1359 ; GFX9-NEXT: v_bfe_u32 v12, v3, 20, 4
1360 ; GFX9-NEXT: v_bfe_u32 v13, v3, 16, 4
1361 ; GFX9-NEXT: v_bfe_u32 v14, v3, 12, 4
1362 ; GFX9-NEXT: v_bfe_u32 v15, v3, 8, 4
1363 ; GFX9-NEXT: v_bfe_u32 v16, v3, 4, 4
1364 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
1365 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1366 ; GFX9-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
1367 ; GFX9-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
1368 ; GFX9-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
1369 ; GFX9-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
1370 ; GFX9-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
1371 ; GFX9-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
1372 ; GFX9-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
1373 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
1374 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
1375 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
1376 ; GFX9-NEXT: s_endpgm
1378 ; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1379 ; GFX9-DL: ; %bb.0: ; %entry
1380 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1381 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1382 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1383 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1384 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1385 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1386 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1387 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1388 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
1389 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1390 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1391 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
1392 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
1393 ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3]
1394 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1395 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2
1396 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1397 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
1398 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
1399 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
1400 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
1401 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
1402 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1403 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1404 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v3
1405 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v3
1406 ; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 20, 4
1407 ; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 16, 4
1408 ; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 12, 4
1409 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 8, 4
1410 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 4, 4
1411 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
1412 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1413 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v17
1414 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v9, v16, v2
1415 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v8, v15, v2
1416 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v14, v2
1417 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v13, v2
1418 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v5, v12, v2
1419 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v4, v11, v2
1420 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v10, v2
1421 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0
1422 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
1423 ; GFX9-DL-NEXT: s_endpgm
1425 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1426 ; GFX10-DL: ; %bb.0: ; %entry
1427 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1428 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1429 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1430 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1431 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1432 ; GFX10-DL-NEXT: s_clause 0x1
1433 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1434 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1435 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1436 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1437 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1438 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1439 ; GFX10-DL-NEXT: s_clause 0x1
1440 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1441 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1442 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1443 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1444 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
1445 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1446 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
1447 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
1448 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
1449 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1450 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
1451 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
1452 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
1453 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1454 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
1455 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
1456 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1457 ; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
1458 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
1459 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1460 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
1461 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
1462 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1463 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
1464 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
1465 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
1466 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
1467 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
1468 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
1469 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
1470 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
1471 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1472 ; GFX10-DL-NEXT: s_endpgm
1473 <8 x i4> addrspace(1)* %src2,
1474 i4 addrspace(1)* nocapture %dst) {
; Each work-item indexes both sources by its workitem.id.x and loads one
; <8 x i4> vector from each.
1476 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1477 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1478 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1479 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1480 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Element-wise i4 products of the two vectors, elements 0 through 7.
1482 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1483 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1484 %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1486 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1487 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1488 %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1490 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1491 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1492 %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1494 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1495 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1496 %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1498 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1499 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1500 %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1502 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1503 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1504 %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1506 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1507 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1508 %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1510 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1511 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1512 %mul7 = mul nuw nsw i4 %v1e7, %v2e7
; Accumulate onto the i4 value loaded from %dst and store the result back.
; Every add here has the product as its first operand and the previous sum
; (or the loaded accumulator, for %add1) as its second; this operand order is
; what the test exercises, so it must be preserved.
1514 %acc = load i4, i4 addrspace(1)* %dst, align 4
1515 %add1 = add i4 %mul0, %acc
1516 %add2 = add i4 %mul1, %add1
1517 %add3 = add i4 %mul2, %add2
1518 %add4 = add i4 %mul3, %add3
1519 %add5 = add i4 %mul4, %add4
1520 %add6 = add i4 %mul5, %add5
1521 %add7 = add i4 %mul6, %add6
1522 %add8 = add i4 %mul7, %add7
1524 store i4 %add8, i4 addrspace(1)* %dst, align 4
; Kernel under test: an 8-lane dot product of zero-extended i4 elements that is
; accumulated in i32, where the first product (%mul0) and the first partial sum
; (%add1) each have more than one use.
; The check lines that follow are autogenerated (see the NOTE at the top of the
; file) and record the expected llc output for every RUN configuration; for this
; kernel none of them contains a v_dot8_u32_u4 instruction.
1528 define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1529 ; GFX7-LABEL: udot8_multiuses_mul1:
1530 ; GFX7: ; %bb.0: ; %entry
1531 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1532 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1533 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1534 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1535 ; GFX7-NEXT: s_mov_b32 s14, -1
1536 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1537 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1538 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1539 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1540 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1541 ; GFX7-NEXT: s_mov_b32 s10, 0
1542 ; GFX7-NEXT: s_mov_b32 s11, s3
1543 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1544 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1545 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1546 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1547 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1548 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1549 ; GFX7-NEXT: s_mov_b32 s2, -1
1550 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1551 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1552 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
1553 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
1554 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
1555 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
1556 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
1557 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
1558 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
1559 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1560 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1561 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1562 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
1563 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
1564 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
1565 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
1566 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
1567 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
1568 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1569 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1570 ; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4
1571 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
1572 ; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16
1573 ; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2
1574 ; GFX7-NEXT: v_mad_u32_u24 v2, v6, v13, v2
1575 ; GFX7-NEXT: v_mad_u32_u24 v2, v5, v12, v2
1576 ; GFX7-NEXT: v_mad_u32_u24 v2, v4, v11, v2
1577 ; GFX7-NEXT: v_mad_u32_u24 v2, v3, v10, v2
1578 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v2
1579 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1580 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1581 ; GFX7-NEXT: s_endpgm
1583 ; GFX8-LABEL: udot8_multiuses_mul1:
1584 ; GFX8: ; %bb.0: ; %entry
1585 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1586 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1587 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1588 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1589 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1590 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1591 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1592 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1593 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1595 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
1596 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1597 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1598 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
1599 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1600 ; GFX8-NEXT: s_mov_b32 s10, -1
1601 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1602 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1603 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1604 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1605 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
1606 ; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4
1607 ; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4
1608 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4
1609 ; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4
1610 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
1611 ; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4
1612 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1613 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1614 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1
1615 ; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4
1616 ; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4
1617 ; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4
1618 ; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4
1619 ; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4
1620 ; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4
1621 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
1622 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1623 ; GFX8-NEXT: v_mad_u32_u24 v16, v0, v1, s2
1624 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v16
1625 ; GFX8-NEXT: v_mad_u32_u24 v1, v8, v15, v16
1626 ; GFX8-NEXT: v_mad_u32_u24 v1, v7, v14, v1
1627 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v13, v1
1628 ; GFX8-NEXT: v_mad_u32_u24 v1, v5, v12, v1
1629 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v11, v1
1630 ; GFX8-NEXT: v_mad_u32_u24 v1, v3, v10, v1
1631 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v9, v1
1632 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
1633 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1634 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1635 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1636 ; GFX8-NEXT: s_endpgm
1638 ; GFX9-LABEL: udot8_multiuses_mul1:
1639 ; GFX9: ; %bb.0: ; %entry
1640 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1641 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1642 ; GFX9-NEXT: s_mov_b32 s10, -1
1643 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1644 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1645 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1646 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1647 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1648 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1650 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1651 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1652 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1653 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1654 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1655 ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4
1656 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1657 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1658 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1659 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
1660 ; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
1661 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
1662 ; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
1663 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
1664 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
1665 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
1666 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
1667 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
1668 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
1669 ; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4
1670 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
1671 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
1672 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2
1673 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1674 ; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0
1675 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16
1676 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
1677 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1
1678 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
1679 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
1680 ; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8
1681 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
1682 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
1683 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6
1684 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4
1685 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2
1686 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1687 ; GFX9-NEXT: s_endpgm
1689 ; GFX9-DL-LABEL: udot8_multiuses_mul1:
1690 ; GFX9-DL: ; %bb.0: ; %entry
1691 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1692 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1693 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1694 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1695 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1696 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1697 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1698 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1699 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1700 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1701 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1702 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1703 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1704 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1705 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1706 ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4
1707 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1708 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1709 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1710 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
1711 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
1712 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
1713 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
1714 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
1715 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
1716 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
1717 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
1718 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
1719 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
1720 ; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
1721 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
1722 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
1723 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2
1724 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1725 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
1726 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16
1727 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15
1728 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1
1729 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14
1730 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13
1731 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8
1732 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12
1733 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11
1734 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6
1735 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4
1736 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2
1737 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1738 ; GFX9-DL-NEXT: s_endpgm
1740 ; GFX10-DL-LABEL: udot8_multiuses_mul1:
1741 ; GFX10-DL: ; %bb.0: ; %entry
1742 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1743 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1744 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1745 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1746 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1747 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
1748 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
1749 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
1750 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
1751 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1752 ; GFX10-DL-NEXT: s_clause 0x1
1753 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1754 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1755 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1756 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1757 ; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
1758 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1759 ; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2
1760 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
1761 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1
1762 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
1763 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
1764 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4
1765 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
1766 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
1767 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4
1768 ; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
1769 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
1770 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1771 ; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2
1772 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
1773 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
1774 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
1775 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12
1776 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13
1777 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1778 ; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
1779 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15
1780 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14
1781 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7
1782 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2
1783 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10
1784 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9
1785 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5
1786 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
1787 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1788 ; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
1789 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
1790 ; GFX10-DL-NEXT: s_endpgm
1791 <8 x i4> addrspace(1)* %src2,
1792 i32 addrspace(1)* nocapture %dst) {
; Each work-item uses its x id to pick one <8 x i4> vector from %src1 and one
; from %src2.
1794 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1795 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1796 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1797 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1798 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; For every lane k in 0..7: extract element k of both vectors, zero-extend each
; to i32, and multiply them (%mulk).
1800 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1801 %cv1e0 = zext i4 %v1e0 to i32
1802 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1803 %cv2e0 = zext i4 %v2e0 to i32
1804 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1806 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1807 %cv1e1 = zext i4 %v1e1 to i32
1808 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1809 %cv2e1 = zext i4 %v2e1 to i32
1810 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1812 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1813 %cv1e2 = zext i4 %v1e2 to i32
1814 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1815 %cv2e2 = zext i4 %v2e2 to i32
1816 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1818 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1819 %cv1e3 = zext i4 %v1e3 to i32
1820 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1821 %cv2e3 = zext i4 %v2e3 to i32
1822 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1824 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1825 %cv1e4 = zext i4 %v1e4 to i32
1826 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1827 %cv2e4 = zext i4 %v2e4 to i32
1828 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1830 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1831 %cv1e5 = zext i4 %v1e5 to i32
1832 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1833 %cv2e5 = zext i4 %v2e5 to i32
1834 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1836 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1837 %cv1e6 = zext i4 %v1e6 to i32
1838 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1839 %cv2e6 = zext i4 %v2e6 to i32
1840 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1842 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1843 %cv1e7 = zext i4 %v1e7 to i32
1844 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1845 %cv2e7 = zext i4 %v2e7 to i32
1846 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Start the sum from the i32 value currently stored at %dst.
1848 %acc = load i32, i32 addrspace(1)* %dst, align 4
1849 %add1 = add i32 %mul0, %acc
; Extra uses: %add consumes %mul0 a second time together with %add1, while the
; main chain (%add2..%add8) also continues from %add1.
1850 %add = add i32 %mul0, %add1
1851 %add2 = add i32 %add1, %mul1
1852 %add3 = add i32 %add2, %mul2
1853 %add4 = add i32 %add3, %mul3
1854 %add5 = add i32 %add4, %mul4
1855 %add6 = add i32 %add5, %mul5
1856 %add7 = add i32 %add6, %mul6
1857 %add8 = add i32 %add7, %mul7
; Combine the side value %add with the full chain result and write it back to
; %dst.
1859 %res = add i32 %add, %add8
1860 store i32 %res, i32 addrspace(1)* %dst, align 4
; Kernel under test: an 8-lane i4 dot product with an i32 accumulator, written
; as whole-vector zero extension followed by a single <8 x i32> multiply whose
; lanes are then extracted and summed.
; The check lines that follow are autogenerated (see the NOTE at the top of the
; file). For the gfx906, gfx1011 and gfx1012 runs they expect one v_dot8_u32_u4;
; for the other runs they expect per-lane v_mad_u32_u24 or
; v_mul_u32_u24 / v_add3_u32 sequences.
1864 define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1865 ; GFX7-LABEL: udot8_acc32_vecMul:
1866 ; GFX7: ; %bb.0: ; %entry
1867 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1868 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1869 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1870 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1871 ; GFX7-NEXT: s_mov_b32 s14, -1
1872 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1873 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1874 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1875 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1876 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1877 ; GFX7-NEXT: s_mov_b32 s10, 0
1878 ; GFX7-NEXT: s_mov_b32 s11, s3
1879 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1880 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1881 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1882 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1883 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1884 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1885 ; GFX7-NEXT: s_mov_b32 s2, -1
1886 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1887 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1888 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
1889 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
1890 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
1891 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
1892 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
1893 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
1894 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
1895 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
1896 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1897 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
1898 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
1899 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
1900 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
1901 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
1902 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
1903 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
1904 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
1905 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1906 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
1907 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1908 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1909 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1910 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1911 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1912 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
1913 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
1914 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1915 ; GFX7-NEXT: s_endpgm
1917 ; GFX8-LABEL: udot8_acc32_vecMul:
1918 ; GFX8: ; %bb.0: ; %entry
1919 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1920 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1921 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1922 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1923 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1924 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1925 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1926 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1927 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1929 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
1930 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1931 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1932 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
1933 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1934 ; GFX8-NEXT: s_mov_b32 s10, -1
1935 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1936 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1937 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1938 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1939 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
1940 ; GFX8-NEXT: v_bfe_u32 v3, v0, 24, 4
1941 ; GFX8-NEXT: v_bfe_u32 v4, v0, 20, 4
1942 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 4
1943 ; GFX8-NEXT: v_bfe_u32 v6, v0, 12, 4
1944 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
1945 ; GFX8-NEXT: v_bfe_u32 v8, v0, 4, 4
1946 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
1947 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1948 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v1
1949 ; GFX8-NEXT: v_bfe_u32 v10, v1, 24, 4
1950 ; GFX8-NEXT: v_bfe_u32 v11, v1, 20, 4
1951 ; GFX8-NEXT: v_bfe_u32 v12, v1, 16, 4
1952 ; GFX8-NEXT: v_bfe_u32 v13, v1, 12, 4
1953 ; GFX8-NEXT: v_bfe_u32 v14, v1, 8, 4
1954 ; GFX8-NEXT: v_bfe_u32 v15, v1, 4, 4
1955 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
1956 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1957 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s2
1958 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
1959 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
1960 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
1961 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
1962 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
1963 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v10, v0
1964 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v9, v0
1965 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1966 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1967 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1968 ; GFX8-NEXT: s_endpgm
1970 ; GFX9-LABEL: udot8_acc32_vecMul:
1971 ; GFX9: ; %bb.0: ; %entry
1972 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1973 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1974 ; GFX9-NEXT: s_mov_b32 s10, -1
1975 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1976 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1977 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1978 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1979 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1980 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1981 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1983 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1984 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1985 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1986 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1987 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
1988 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1989 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1990 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
1991 ; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
1992 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
1993 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
1994 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
1995 ; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
1996 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
1997 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
1998 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
1999 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
2000 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
2001 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
2002 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
2003 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
2004 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
2005 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
2006 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
2007 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
2008 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2009 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
2010 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
2011 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
2012 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
2013 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
2014 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
2015 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
2016 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
2017 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
2018 ; GFX9-NEXT: s_endpgm
2020 ; GFX9-DL-LABEL: udot8_acc32_vecMul:
2021 ; GFX9-DL: ; %bb.0: ; %entry
2022 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2023 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2024 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2025 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2026 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2027 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2028 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2029 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2030 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
2031 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2032 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2033 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
2034 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
2035 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
2036 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2037 ; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0
2038 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
2039 ; GFX9-DL-NEXT: s_endpgm
2041 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
2042 ; GFX10-DL: ; %bb.0: ; %entry
2043 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2044 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2045 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2046 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2047 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2048 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2049 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2050 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2051 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2052 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2053 ; GFX10-DL-NEXT: s_clause 0x1
2054 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2055 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2056 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2057 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
2058 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2059 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
2060 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
2061 ; GFX10-DL-NEXT: s_endpgm
2062 <8 x i4> addrspace(1)* %src2,
2063 i32 addrspace(1)* nocapture %dst) {
; Each work-item uses its x id to pick one <8 x i4> vector from %src1 and one
; from %src2.
2065 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2066 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2067 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2068 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2069 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Zero-extend all eight lanes at once, then multiply the two widened vectors.
2071 %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2072 %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
2074 %mul = mul <8 x i32> %cvec1, %cvec2
; Pull the eight i32 products back out of the vector result.
2075 %mul0 = extractelement <8 x i32> %mul, i64 0
2076 %mul1 = extractelement <8 x i32> %mul, i64 1
2077 %mul2 = extractelement <8 x i32> %mul, i64 2
2078 %mul3 = extractelement <8 x i32> %mul, i64 3
2079 %mul4 = extractelement <8 x i32> %mul, i64 4
2080 %mul5 = extractelement <8 x i32> %mul, i64 5
2081 %mul6 = extractelement <8 x i32> %mul, i64 6
2082 %mul7 = extractelement <8 x i32> %mul, i64 7
; Add the products, in lane order, onto the i32 value loaded from %dst and
; store the final sum back to %dst.
2084 %acc = load i32, i32 addrspace(1)* %dst, align 4
2085 %add1 = add i32 %mul0, %acc
2086 %add2 = add i32 %add1, %mul1
2087 %add3 = add i32 %add2, %mul2
2088 %add4 = add i32 %add3, %mul3
2089 %add5 = add i32 %add4, %mul4
2090 %add6 = add i32 %add5, %mul5
2091 %add7 = add i32 %add6, %mul6
2092 %add8 = add i32 %add7, %mul7
2094 store i32 %add8, i32 addrspace(1)* %dst, align 4
2098 ; TODO: Clean up the generated code (pk_mad_I16 should be generated by
2099 ; default), then support this pattern.
2100 define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2101 ; GFX7-LABEL: udot8_acc16_vecMul:
2102 ; GFX7: ; %bb.0: ; %entry
2103 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2104 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2105 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2106 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2107 ; GFX7-NEXT: s_mov_b32 s14, -1
2108 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2109 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2110 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2111 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2112 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2113 ; GFX7-NEXT: s_mov_b32 s10, 0
2114 ; GFX7-NEXT: s_mov_b32 s11, s3
2115 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2116 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2117 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2118 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2119 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2120 ; GFX7-NEXT: s_mov_b32 s2, -1
2121 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0
2122 ; GFX7-NEXT: s_mov_b32 s4, 0xf0000
2123 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2124 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2125 ; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4
2126 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v2
2127 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
2128 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
2129 ; GFX7-NEXT: v_bfe_u32 v4, v2, 12, 4
2130 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
2131 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v2
2132 ; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16
2133 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v8
2134 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2135 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v0
2136 ; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
2137 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v8
2138 ; GFX7-NEXT: v_and_b32_e32 v13, 15, v0
2139 ; GFX7-NEXT: v_or_b32_e32 v7, v13, v7
2140 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v6
2141 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v7
2142 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v6
2143 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v7
2144 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2145 ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v7, v16
2146 ; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4
2147 ; GFX7-NEXT: v_mad_u32_u24 v6, v8, v13, v6
2148 ; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4
2149 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
2150 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
2151 ; GFX7-NEXT: v_bfe_u32 v11, v0, 12, 4
2152 ; GFX7-NEXT: v_alignbit_b32 v0, v14, v0, 16
2153 ; GFX7-NEXT: v_mad_u32_u24 v5, v5, v12, v6
2154 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v2
2155 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v0
2156 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
2157 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2158 ; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v5
2159 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4
2160 ; GFX7-NEXT: v_mad_u32_u24 v0, v15, v14, v0
2161 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2162 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
2163 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2164 ; GFX7-NEXT: s_endpgm
2166 ; GFX8-LABEL: udot8_acc16_vecMul:
2167 ; GFX8: ; %bb.0: ; %entry
2168 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2169 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2170 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2171 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2172 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2173 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2174 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2175 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2176 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2177 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
2178 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2179 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2180 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2181 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
2182 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2183 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
2184 ; GFX8-NEXT: flat_load_ushort v18, v[2:3]
2185 ; GFX8-NEXT: s_mov_b32 s10, -1
2186 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2187 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2188 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2189 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2190 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v4
2191 ; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4
2192 ; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4
2193 ; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4
2194 ; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4
2195 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2196 ; GFX8-NEXT: v_and_b32_e32 v11, 15, v0
2197 ; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4
2198 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2199 ; GFX8-NEXT: v_mad_u16 v1, v1, v11, v18
2200 ; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4
2201 ; GFX8-NEXT: v_mad_u16 v1, v5, v12, v1
2202 ; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4
2203 ; GFX8-NEXT: v_mad_u16 v1, v6, v13, v1
2204 ; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4
2205 ; GFX8-NEXT: v_mad_u16 v1, v7, v14, v1
2206 ; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4
2207 ; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4
2208 ; GFX8-NEXT: v_mad_u16 v1, v8, v15, v1
2209 ; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4
2210 ; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4
2211 ; GFX8-NEXT: v_mad_u16 v1, v9, v16, v1
2212 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4
2213 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
2214 ; GFX8-NEXT: v_mad_u16 v1, v10, v17, v1
2215 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1
2216 ; GFX8-NEXT: flat_store_short v[2:3], v0
2217 ; GFX8-NEXT: s_endpgm
2219 ; GFX9-LABEL: udot8_acc16_vecMul:
2220 ; GFX9: ; %bb.0: ; %entry
2221 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2222 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2223 ; GFX9-NEXT: s_mov_b32 s10, -1
2224 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2225 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2226 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2227 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2228 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2229 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
2230 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2231 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2232 ; GFX9-NEXT: global_load_dword v3, v0, s[4:5]
2233 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
2234 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2235 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2236 ; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4
2237 ; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4
2238 ; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4
2239 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2240 ; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4
2241 ; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4
2242 ; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4
2243 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v4
2244 ; GFX9-NEXT: v_and_b32_e32 v10, 15, v3
2245 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2246 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
2247 ; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4
2248 ; GFX9-NEXT: v_and_b32_e32 v6, v2, v6
2249 ; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4
2250 ; GFX9-NEXT: v_and_b32_e32 v8, v2, v8
2251 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4
2252 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4
2253 ; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4
2254 ; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4
2255 ; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4
2256 ; GFX9-NEXT: v_and_b32_e32 v17, v2, v17
2257 ; GFX9-NEXT: v_and_b32_e32 v11, v2, v11
2258 ; GFX9-NEXT: v_and_b32_e32 v13, v2, v13
2259 ; GFX9-NEXT: v_and_b32_e32 v15, v2, v15
2260 ; GFX9-NEXT: v_and_b32_e32 v2, v2, v10
2261 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17
2262 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
2263 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v4
2264 ; GFX9-NEXT: global_load_ushort v4, v1, s[2:3]
2265 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
2266 ; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6
2267 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v13
2268 ; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v15
2269 ; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8
2270 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v6, v5
2271 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v7
2272 ; GFX9-NEXT: v_lshl_or_b32 v10, v12, 16, v11
2273 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v10
2274 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2275 ; GFX9-NEXT: v_add_u16_e32 v4, v2, v4
2276 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2277 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v5
2278 ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2279 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v3
2280 ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v0
2282 ; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2283 ; GFX9-NEXT: global_store_short v1, v0, s[2:3]
2284 ; GFX9-NEXT: s_endpgm
2286 ; GFX9-DL-LABEL: udot8_acc16_vecMul:
2287 ; GFX9-DL: ; %bb.0: ; %entry
2288 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2289 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2290 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2291 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2292 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2293 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2294 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2295 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2296 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2297 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
2298 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2299 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
2300 ; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7]
2301 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2302 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2303 ; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4
2304 ; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4
2305 ; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4
2306 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4
2308 ; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4
2309 ; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4
2310 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4
2311 ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3
2312 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2313 ; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0
2314 ; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
2315 ; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6
2316 ; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4
2317 ; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8
2318 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4
2319 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4
2320 ; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4
2321 ; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4
2322 ; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4
2323 ; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17
2324 ; GFX9-DL-NEXT: v_and_b32_e32 v11, v2, v11
2325 ; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13
2326 ; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15
2327 ; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v10
2328 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17
2329 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
2330 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
2331 ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3]
2332 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0
2333 ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6
2334 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v14, 16, v13
2335 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v16, 16, v15
2336 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8
2337 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v6, v5
2338 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v7
2339 ; GFX9-DL-NEXT: v_lshl_or_b32 v10, v12, 16, v11
2340 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v10
2341 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2342 ; GFX9-DL-NEXT: v_add_u16_e32 v4, v2, v4
2343 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2344 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v5
2345 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2346 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v3
2347 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2348 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0
2349 ; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2350 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3]
2351 ; GFX9-DL-NEXT: s_endpgm
2353 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
2354 ; GFX10-DL: ; %bb.0: ; %entry
2355 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2356 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2357 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2358 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
2359 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2360 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2361 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2362 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2363 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2364 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2365 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2366 ; GFX10-DL-NEXT: s_clause 0x1
2367 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2368 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2369 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2370 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
2371 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2372 ; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1
2373 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2374 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
2375 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4
2376 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
2377 ; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4
2378 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
2379 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
2380 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
2381 ; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
2382 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
2383 ; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7
2384 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
2385 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
2386 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
2387 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
2388 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
2389 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6
2390 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
2391 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
2392 ; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
2393 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
2394 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6
2395 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2396 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3
2397 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4
2398 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11
2399 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
2400 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
2401 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12
2402 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
2403 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4
2404 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
2405 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
2406 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
2407 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2408 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9
2409 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10
2410 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5
2411 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6
2412 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
2413 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9
2414 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4
2415 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
2416 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1
2417 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2
2418 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
2419 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2420 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
2421 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
2422 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
2423 ; GFX10-DL-NEXT: s_endpgm
2424 <8 x i4> addrspace(1)* %src2,
2425 i16 addrspace(1)* nocapture %dst) {
2427 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2428 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2429 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2430 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2431 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2433 %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2434 %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2436 %mul = mul <8 x i16> %cvec1, %cvec2
2437 %mul0 = extractelement <8 x i16> %mul, i64 0
2438 %mul1 = extractelement <8 x i16> %mul, i64 1
2439 %mul2 = extractelement <8 x i16> %mul, i64 2
2440 %mul3 = extractelement <8 x i16> %mul, i64 3
2441 %mul4 = extractelement <8 x i16> %mul, i64 4
2442 %mul5 = extractelement <8 x i16> %mul, i64 5
2443 %mul6 = extractelement <8 x i16> %mul, i64 6
2444 %mul7 = extractelement <8 x i16> %mul, i64 7
2446 %acc = load i16, i16 addrspace(1)* %dst, align 4
2447 %add1 = add i16 %mul0, %acc
2448 %add2 = add i16 %add1, %mul1
2449 %add3 = add i16 %add2, %mul2
2450 %add4 = add i16 %add3, %mul3
2451 %add5 = add i16 %add4, %mul4
2452 %add6 = add i16 %add5, %mul5
2453 %add7 = add i16 %add6, %mul6
2454 %add8 = add i16 %add7, %mul7
2456 store i16 %add8, i16 addrspace(1)* %dst, align 4
2460 ; TODO: Clean up the code to generate MAD; the pattern should be recognized then.
; Kernel under test: an 8-lane unsigned 4-bit dot product with an i8
; accumulator, where the multiply is written as a single vector operation.
;   %src1, %src2 - global (addrspace(1)) arrays of <8 x i4>, indexed by the
;                  work-item x id.
;   %dst         - global i8 that is read as the initial accumulator and
;                  overwritten with the final sum.
; The per-target check blocks interleaved below are autogenerated (see the
; NOTE on the first line of the file); regenerate them with that script
; instead of editing them by hand. As currently recorded, every block expects
; per-lane multiply / mad / add instructions and no dot-product instruction.
2461 define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2462 ; GFX7-LABEL: udot8_acc8_vecMul:
2463 ; GFX7: ; %bb.0: ; %entry
2464 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2465 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2466 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2467 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2468 ; GFX7-NEXT: s_mov_b32 s14, -1
2469 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2470 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2471 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2472 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2473 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2474 ; GFX7-NEXT: s_mov_b32 s10, 0
2475 ; GFX7-NEXT: s_mov_b32 s11, s3
2476 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2477 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2478 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2479 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2480 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2481 ; GFX7-NEXT: s_mov_b32 s2, -1
2482 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
2483 ; GFX7-NEXT: s_movk_i32 s4, 0xf00
2484 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xf00
2485 ; GFX7-NEXT: s_movk_i32 s5, 0xf0f
2486 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2487 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2488 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2
2489 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2
2490 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2
2491 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2
2492 ; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 4
2493 ; GFX7-NEXT: v_and_b32_e32 v5, 15, v2
2494 ; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4
2495 ; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24
2496 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v9
2497 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2498 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0
2499 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0
2500 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
2501 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
2502 ; GFX7-NEXT: v_and_b32_e32 v6, v3, v9
2503 ; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4
2504 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v11
2505 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
2506 ; GFX7-NEXT: v_or_b32_e32 v3, v10, v3
2507 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v0
2508 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2509 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0
2510 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6
2511 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2512 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v2
2513 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3
2514 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0
2515 ; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4
2516 ; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24
2517 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
2518 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1
2519 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v15
2520 ; GFX7-NEXT: v_and_b32_e32 v0, s5, v0
2521 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
2522 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2523 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v1
2524 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v3
2525 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
2526 ; GFX7-NEXT: v_or_b32_e32 v4, v14, v4
2527 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2528 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 4
2529 ; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 4
2530 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2531 ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16
2532 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
2533 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1
2534 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3
2535 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 4
2536 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4
2537 ; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6
2538 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v6
2539 ; GFX7-NEXT: v_and_b32_e32 v8, 15, v2
2540 ; GFX7-NEXT: v_and_b32_e32 v14, 15, v0
2541 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v10, v1
2542 ; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 4
2543 ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
2544 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
2545 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
2546 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0
2547 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4
2548 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4
2549 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1
2550 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2551 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v11, v0
2552 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2553 ; GFX7-NEXT: s_endpgm
2555 ; GFX8-LABEL: udot8_acc8_vecMul:
2556 ; GFX8: ; %bb.0: ; %entry
2557 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2558 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2559 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2560 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2561 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2562 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2563 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2564 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2565 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2566 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
2567 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
2568 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2569 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
2570 ; GFX8-NEXT: flat_load_dword v2, v[2:3]
2571 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2572 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2573 ; GFX8-NEXT: flat_load_ubyte v5, v[0:1]
2574 ; GFX8-NEXT: s_mov_b32 s10, -1
2575 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2576 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2577 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2578 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2579 ; GFX8-NEXT: v_bfe_u32 v3, v4, 20, 4
2580 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2581 ; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4
2582 ; GFX8-NEXT: v_bfe_u32 v7, v4, 24, 4
2583 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v4
2584 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2
2585 ; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4
2586 ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2587 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 4
2588 ; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4
2589 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4
2590 ; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4
2591 ; GFX8-NEXT: v_bfe_u32 v10, v4, 12, 4
2592 ; GFX8-NEXT: v_and_b32_e32 v11, 15, v4
2593 ; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4
2594 ; GFX8-NEXT: v_and_b32_e32 v18, 15, v2
2595 ; GFX8-NEXT: v_bfe_u32 v4, v4, 4, 4
2596 ; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4
2597 ; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14
2598 ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2599 ; GFX8-NEXT: v_mul_lo_u16_e32 v19, v6, v12
2600 ; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16
2601 ; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2602 ; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18
2603 ; GFX8-NEXT: v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2604 ; GFX8-NEXT: v_or_b32_e32 v8, v13, v8
2605 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v10
2606 ; GFX8-NEXT: v_or_b32_e32 v10, v11, v4
2607 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v8
2608 ; GFX8-NEXT: v_or_b32_e32 v3, v19, v3
2609 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2610 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v9
2611 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v2
2612 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3
2613 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
2614 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v4
2615 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2616 ; GFX8-NEXT: v_add_u16_e32 v3, v10, v5
2617 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4
2618 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v9
2619 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
2620 ; GFX8-NEXT: v_mad_u16 v2, v6, v12, v2
2621 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v11
2622 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8
2623 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
2624 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
2625 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2626 ; GFX8-NEXT: s_endpgm
2628 ; GFX9-LABEL: udot8_acc8_vecMul:
2629 ; GFX9: ; %bb.0: ; %entry
2630 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2631 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2632 ; GFX9-NEXT: s_mov_b32 s10, -1
2633 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2634 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2635 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2636 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2637 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2638 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2639 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2640 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2641 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2642 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2643 ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
2644 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2645 ; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4
2646 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2647 ; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
2648 ; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4
2649 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1
2650 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2
2651 ; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4
2652 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2653 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4
2654 ; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4
2655 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
2656 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
2657 ; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4
2658 ; GFX9-NEXT: v_and_b32_e32 v10, 15, v1
2659 ; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4
2660 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v2
2661 ; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4
2662 ; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4
2663 ; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13
2664 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2665 ; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11
2666 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2667 ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15
2668 ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2669 ; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17
2670 ; GFX9-NEXT: v_or_b32_e32 v7, v12, v7
2671 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9
2672 ; GFX9-NEXT: v_or_b32_e32 v1, v18, v0
2673 ; GFX9-NEXT: v_or_b32_e32 v9, v10, v2
2674 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7
2675 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2676 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8
2677 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
2678 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1
2679 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
2680 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2
2681 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2682 ; GFX9-NEXT: v_add_u16_e32 v1, v9, v4
2683 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v2
2684 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v8
2685 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
2686 ; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0
2687 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v10
2688 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2689 ; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v13, v0
2690 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7
2691 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
2692 ; GFX9-NEXT: s_endpgm
2694 ; GFX9-DL-LABEL: udot8_acc8_vecMul:
2695 ; GFX9-DL: ; %bb.0: ; %entry
2696 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2697 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2698 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2699 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2700 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2701 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2702 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2703 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2704 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
2705 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2706 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2707 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2708 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2709 ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
2710 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2711 ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4
2712 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2713 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4
2714 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4
2715 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
2716 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
2717 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
2718 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2719 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
2720 ; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
2721 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
2722 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4
2723 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
2724 ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1
2725 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4
2726 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2
2727 ; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4
2728 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4
2729 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13
2730 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2731 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11
2732 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2733 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15
2734 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2735 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17
2736 ; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7
2737 ; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9
2738 ; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0
2739 ; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2
2740 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7
2741 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2742 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
2743 ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0
2744 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1
2745 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
2746 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
2747 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2748 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v9, v4
2749 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2
2750 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8
2751 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
2752 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0
2753 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10
2754 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2755 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v13, v0
2756 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7
2757 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
2758 ; GFX9-DL-NEXT: s_endpgm
2760 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
2761 ; GFX10-DL: ; %bb.0: ; %entry
2762 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2763 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2764 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2765 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
2766 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2767 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2768 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2769 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2770 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2771 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2772 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2773 ; GFX10-DL-NEXT: s_clause 0x1
2774 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2775 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2776 ; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
2777 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2778 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
2779 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2780 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
2781 ; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
2782 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
2783 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
2784 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
2785 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10
2786 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
2787 ; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13
2788 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4
2789 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4
2790 ; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1
2791 ; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
2792 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4
2793 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4
2794 ; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14
2795 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4
2796 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
2797 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4
2798 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
2799 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15
2800 ; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9
2801 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10
2802 ; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13
2803 ; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7
2804 ; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2
2805 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
2806 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
2807 ; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12
2808 ; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7
2809 ; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
2810 ; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2811 ; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
2812 ; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9
2813 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7
2814 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10
2815 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2816 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3
2817 ; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2818 ; GFX10-DL-NEXT: v_add_nc_u16 v9, v3, v10
2819 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
2820 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
2821 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8
2822 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
2823 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0
2824 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
2825 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
2826 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0
2827 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
2828 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
2829 ; GFX10-DL-NEXT: s_endpgm
; Remaining parameters of the kernel signature; the check blocks above sit
; between the first parameter and these two.
2830 <8 x i4> addrspace(1)* %src2,
2831 i8 addrspace(1)* nocapture %dst) {
; Each work-item uses its x id as the element index into both source arrays.
2833 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2834 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2835 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2836 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2837 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Zero-extend the 4-bit lanes to i8, i.e. treat both operands as unsigned.
2839 %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2840 %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
; A single vector multiply produces all eight products (each at most
; 15 * 15 = 225, so it fits in i8); the lanes are then extracted one by one.
2842 %mul = mul <8 x i8> %cvec1, %cvec2
2843 %mul0 = extractelement <8 x i8> %mul, i64 0
2844 %mul1 = extractelement <8 x i8> %mul, i64 1
2845 %mul2 = extractelement <8 x i8> %mul, i64 2
2846 %mul3 = extractelement <8 x i8> %mul, i64 3
2847 %mul4 = extractelement <8 x i8> %mul, i64 4
2848 %mul5 = extractelement <8 x i8> %mul, i64 5
2849 %mul6 = extractelement <8 x i8> %mul, i64 6
2850 %mul7 = extractelement <8 x i8> %mul, i64 7
; Load the current accumulator from %dst and add the products to it in lane
; order 0..7. The adds carry no nuw/nsw flags, so the i8 sum wraps on overflow.
2852 %acc = load i8, i8 addrspace(1)* %dst, align 4
2853 %add1 = add i8 %mul0, %acc
2854 %add2 = add i8 %add1, %mul1
2855 %add3 = add i8 %add2, %mul2
2856 %add4 = add i8 %add3, %mul3
2857 %add5 = add i8 %add4, %mul4
2858 %add6 = add i8 %add5, %mul5
2859 %add7 = add i8 %add6, %mul6
2860 %add8 = add i8 %add7, %mul7
; Write the final sum back over the accumulator.
2862 store i8 %add8, i8 addrspace(1)* %dst, align 4
2866 ; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
2867 define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
2868 ; GFX7-LABEL: udot8_acc4_vecMul:
2869 ; GFX7: ; %bb.0: ; %entry
2870 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2871 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2872 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2873 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2874 ; GFX7-NEXT: s_mov_b32 s14, -1
2875 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2876 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2877 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2878 ; GFX7-NEXT: s_mov_b32 s10, 0
2879 ; GFX7-NEXT: s_mov_b32 s11, s3
2880 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2881 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2882 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2883 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2884 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2885 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2886 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2887 ; GFX7-NEXT: s_mov_b32 s2, -1
2888 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
2889 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2890 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2891 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
2892 ; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
2893 ; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
2894 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
2895 ; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
2896 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
2897 ; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
2898 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
2899 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2900 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
2901 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
2902 ; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
2903 ; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
2904 ; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
2905 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
2906 ; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
2907 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2908 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2909 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
2910 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
2911 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
2912 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
2913 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
2914 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2915 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2916 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
2917 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
2918 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2919 ; GFX7-NEXT: s_endpgm
2921 ; GFX8-LABEL: udot8_acc4_vecMul:
2922 ; GFX8: ; %bb.0: ; %entry
2923 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2924 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2925 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2926 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2927 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2928 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2929 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2930 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2931 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2932 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
2933 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2934 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2935 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2936 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2937 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
2938 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
2939 ; GFX8-NEXT: s_mov_b32 s10, -1
2940 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2941 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2942 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2943 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2944 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v4
2945 ; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4
2946 ; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4
2947 ; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4
2948 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2949 ; GFX8-NEXT: v_and_b32_e32 v11, 15, v0
2950 ; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4
2951 ; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4
2952 ; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4
2953 ; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4
2954 ; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4
2955 ; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4
2956 ; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4
2957 ; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4
2958 ; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4
2959 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4
2960 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
2961 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, v4, v0
2962 ; GFX8-NEXT: v_mul_u32_u24_e32 v4, v10, v17
2963 ; GFX8-NEXT: flat_load_ubyte v10, v[2:3]
2964 ; GFX8-NEXT: v_mul_u32_u24_e32 v1, v1, v11
2965 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12
2966 ; GFX8-NEXT: v_mul_u32_u24_e32 v6, v6, v13
2967 ; GFX8-NEXT: v_mul_u32_u24_e32 v7, v7, v14
2968 ; GFX8-NEXT: v_mul_u32_u24_e32 v8, v8, v15
2969 ; GFX8-NEXT: v_mul_u32_u24_e32 v9, v9, v16
2970 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2971 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v10
2972 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v5
2973 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v6
2974 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v7
2975 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v8
2976 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v9
2977 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v4
2978 ; GFX8-NEXT: v_add_u16_e32 v0, v1, v0
2979 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
2980 ; GFX8-NEXT: flat_store_byte v[2:3], v0
2981 ; GFX8-NEXT: s_endpgm
2983 ; GFX9-LABEL: udot8_acc4_vecMul:
2984 ; GFX9: ; %bb.0: ; %entry
2985 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2986 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2987 ; GFX9-NEXT: s_mov_b32 s10, -1
2988 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2989 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2990 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2991 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2992 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2993 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2994 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2995 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2996 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
2997 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
2998 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2999 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v2
3000 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3001 ; GFX9-NEXT: v_and_b32_e32 v10, 15, v3
3002 ; GFX9-NEXT: v_bfe_u32 v4, v2, 4, 4
3003 ; GFX9-NEXT: v_bfe_u32 v11, v3, 4, 4
3004 ; GFX9-NEXT: v_bfe_u32 v5, v2, 8, 4
3005 ; GFX9-NEXT: v_bfe_u32 v12, v3, 8, 4
3006 ; GFX9-NEXT: v_bfe_u32 v6, v2, 12, 4
3007 ; GFX9-NEXT: v_bfe_u32 v13, v3, 12, 4
3008 ; GFX9-NEXT: v_bfe_u32 v7, v2, 16, 4
3009 ; GFX9-NEXT: v_bfe_u32 v14, v3, 16, 4
3010 ; GFX9-NEXT: v_bfe_u32 v8, v2, 20, 4
3011 ; GFX9-NEXT: v_bfe_u32 v9, v2, 24, 4
3012 ; GFX9-NEXT: v_bfe_u32 v15, v3, 20, 4
3013 ; GFX9-NEXT: v_bfe_u32 v16, v3, 24, 4
3014 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3015 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v3
3016 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v2, v3
3017 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v9, v16
3018 ; GFX9-NEXT: global_load_ubyte v9, v1, s[2:3]
3019 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v10
3020 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
3021 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
3022 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
3023 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
3024 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
3025 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3026 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v9
3027 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v4
3028 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5
3029 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v6
3030 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7
3031 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
3032 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v3
3033 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v2
3034 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
3035 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
3036 ; GFX9-NEXT: s_endpgm
3038 ; GFX9-DL-LABEL: udot8_acc4_vecMul:
3039 ; GFX9-DL: ; %bb.0: ; %entry
3040 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3041 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3042 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
3043 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
3044 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
3045 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3046 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3047 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3048 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
3049 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
3050 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3051 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
3052 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
3053 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3054 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v2
3055 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3056 ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3
3057 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 4, 4
3058 ; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 4, 4
3059 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 4
3060 ; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 8, 4
3061 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
3062 ; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 12, 4
3063 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
3064 ; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 16, 4
3065 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 20, 4
3066 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 24, 4
3067 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 20, 4
3068 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 24, 4
3069 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3070 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
3071 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3
3072 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, v9, v16
3073 ; GFX9-DL-NEXT: global_load_ubyte v9, v1, s[2:3]
3074 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v10
3075 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11
3076 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12
3077 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13
3078 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14
3079 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15
3080 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3081 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9
3082 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v4
3083 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5
3084 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v6
3085 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7
3086 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
3087 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v3
3088 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v2
3089 ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v0
3090 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
3091 ; GFX9-DL-NEXT: s_endpgm
3093 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
3094 ; GFX10-DL: ; %bb.0: ; %entry
3095 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3096 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3097 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
3098 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
3099 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
3100 ; GFX10-DL-NEXT: s_clause 0x1
3101 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3102 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3103 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3104 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
3105 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3106 ; GFX10-DL-NEXT: s_clause 0x1
3107 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
3108 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
3109 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3110 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
3111 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
3112 ; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
3113 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3114 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
3115 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4
3116 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4
3117 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
3118 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5
3119 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4
3120 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7
3121 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
3122 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3123 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
3124 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 12, 4
3125 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v8
3126 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 16, 4
3127 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
3128 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4
3129 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v7
3130 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 20, 4
3131 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
3132 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
3133 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8
3134 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
3135 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3136 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
3137 ; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
3138 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7
3139 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
3140 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
3141 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8
3142 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
3143 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
3144 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
3145 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
3146 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
3147 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
3148 ; GFX10-DL-NEXT: s_endpgm
3149 <8 x i4> addrspace(1)* %src2,
3150 i4 addrspace(1)* nocapture %dst) {
3152 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3153 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3154 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3155 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3156 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3158 %mul = mul <8 x i4> %vec1, %vec2
3159 %mul0 = extractelement <8 x i4> %mul, i64 0
3160 %mul1 = extractelement <8 x i4> %mul, i64 1
3161 %mul2 = extractelement <8 x i4> %mul, i64 2
3162 %mul3 = extractelement <8 x i4> %mul, i64 3
3163 %mul4 = extractelement <8 x i4> %mul, i64 4
3164 %mul5 = extractelement <8 x i4> %mul, i64 5
3165 %mul6 = extractelement <8 x i4> %mul, i64 6
3166 %mul7 = extractelement <8 x i4> %mul, i64 7
3168 %acc = load i4, i4 addrspace(1)* %dst, align 4
3169 %add1 = add i4 %mul0, %acc
3170 %add2 = add i4 %add1, %mul1
3171 %add3 = add i4 %add2, %mul2
3172 %add4 = add i4 %add3, %mul3
3173 %add5 = add i4 %add4, %mul4
3174 %add6 = add i4 %add5, %mul5
3175 %add7 = add i4 %add6, %mul6
3176 %add8 = add i4 %add7, %mul7
3178 store i4 %add8, i4 addrspace(1)* %dst, align 4
; udot8_variant1: each workitem loads one i32 from %v1addr and one i32 from
; %v2addr (both indexed by the workitem id), splits each value into eight
; unsigned 4-bit fields, multiplies the fields pairwise, adds the eight
; products to the i32 accumulator loaded from %dst and stores the sum back
; to %dst. The adds are not in field order: the product of the top fields
; (%mul8) is added directly after the first product.
; The assertions below expect a single v_dot8_u32_u4 on the GFX9-DL and
; GFX10-DL runs, a v_mad_u32_u24 chain on the GFX7 and GFX8 runs, and
; v_mul_u32_u24 plus v_add3_u32 on the plain GFX9 run.
3182 define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
3183 ; GFX7-LABEL: udot8_variant1:
3184 ; GFX7: ; %bb.0: ; %entry
3185 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3186 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3187 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
3188 ; GFX7-NEXT: s_mov_b32 s10, 0
3189 ; GFX7-NEXT: s_mov_b32 s11, s3
3190 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3191 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
3192 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3193 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
3194 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
3195 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
3196 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
3197 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
3198 ; GFX7-NEXT: s_mov_b32 s2, -1
3199 ; GFX7-NEXT: s_waitcnt vmcnt(1)
3200 ; GFX7-NEXT: v_and_b32_e32 v1, 15, v2
3201 ; GFX7-NEXT: v_bfe_u32 v3, v2, 4, 4
3202 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3203 ; GFX7-NEXT: v_and_b32_e32 v9, 15, v0
3204 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 4
3205 ; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
3206 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
3207 ; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4
3208 ; GFX7-NEXT: v_bfe_u32 v8, v2, 24, 4
3209 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3210 ; GFX7-NEXT: v_bfe_u32 v10, v0, 4, 4
3211 ; GFX7-NEXT: v_bfe_u32 v11, v0, 8, 4
3212 ; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
3213 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
3214 ; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4
3215 ; GFX7-NEXT: v_bfe_u32 v15, v0, 24, 4
3216 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0
3217 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3218 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v1, s4
3219 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
3220 ; GFX7-NEXT: v_mad_u32_u24 v0, v10, v3, v0
3221 ; GFX7-NEXT: v_mad_u32_u24 v0, v11, v4, v0
3222 ; GFX7-NEXT: v_mad_u32_u24 v0, v12, v5, v0
3223 ; GFX7-NEXT: v_mad_u32_u24 v0, v13, v6, v0
3224 ; GFX7-NEXT: v_mad_u32_u24 v0, v14, v7, v0
3225 ; GFX7-NEXT: v_mad_u32_u24 v0, v15, v8, v0
3226 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
3227 ; GFX7-NEXT: s_endpgm
3229 ; GFX8-LABEL: udot8_variant1:
3230 ; GFX8: ; %bb.0: ; %entry
3231 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3232 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3233 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3234 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3235 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
3236 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
3237 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3238 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3239 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
3240 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
3241 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3242 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
3243 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
3244 ; GFX8-NEXT: s_waitcnt vmcnt(1)
3245 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
3246 ; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
3247 ; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4
3248 ; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4
3249 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3250 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
3251 ; GFX8-NEXT: v_bfe_u32 v5, v0, 4, 4
3252 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4
3253 ; GFX8-NEXT: v_bfe_u32 v9, v0, 12, 4
3254 ; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4
3255 ; GFX8-NEXT: v_bfe_u32 v11, v0, 16, 4
3256 ; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4
3257 ; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4
3258 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4
3259 ; GFX8-NEXT: v_bfe_u32 v15, v0, 24, 4
3260 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
3261 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
3262 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3263 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2
3264 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
3265 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
3266 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
3267 ; GFX8-NEXT: v_mad_u32_u24 v0, v9, v8, v0
3268 ; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0
3269 ; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0
3270 ; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0
3271 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3272 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3273 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3274 ; GFX8-NEXT: s_endpgm
3276 ; GFX9-LABEL: udot8_variant1:
3277 ; GFX9: ; %bb.0: ; %entry
3278 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3279 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3280 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3281 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3282 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
3283 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
3284 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
3285 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3286 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3287 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1
3288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3289 ; GFX9-NEXT: v_and_b32_e32 v4, 15, v2
3290 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
3291 ; GFX9-NEXT: v_bfe_u32 v6, v2, 4, 4
3292 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
3293 ; GFX9-NEXT: v_bfe_u32 v8, v2, 8, 4
3294 ; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4
3295 ; GFX9-NEXT: v_bfe_u32 v10, v2, 12, 4
3296 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
3297 ; GFX9-NEXT: v_bfe_u32 v12, v2, 16, 4
3298 ; GFX9-NEXT: v_bfe_u32 v13, v1, 20, 4
3299 ; GFX9-NEXT: v_bfe_u32 v15, v1, 24, 4
3300 ; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4
3301 ; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
3302 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
3303 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
3304 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v4, v3
3305 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v2, v1
3306 ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5
3307 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7
3308 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3309 ; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1
3310 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9
3311 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11
3312 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5
3313 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v14, v13
3314 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15
3315 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7
3316 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9
3317 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
3318 ; GFX9-NEXT: s_endpgm
3320 ; GFX9-DL-LABEL: udot8_variant1:
3321 ; GFX9-DL: ; %bb.0: ; %entry
3322 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3323 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3324 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3325 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
3326 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3327 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
3328 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
3329 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
3330 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3331 ; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v3, v2, s0
3332 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
3333 ; GFX9-DL-NEXT: s_endpgm
3335 ; GFX10-DL-LABEL: udot8_variant1:
3336 ; GFX10-DL: ; %bb.0: ; %entry
3337 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3338 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3339 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3340 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3341 ; GFX10-DL-NEXT: s_clause 0x1
3342 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
3343 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
3344 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3345 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
3346 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3347 ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2
3348 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
3349 ; GFX10-DL-NEXT: s_endpgm
3350 i32 addrspace(1)* %v2addr,
3351 i32 addrspace(1)* %dst) {
; Per-workitem operands: element %idx of each input array.
3353 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3354 %gep1 = getelementptr i32, i32 addrspace(1)* %v1addr, i32 %idx
3355 %v1 = load i32, i32 addrspace(1)* %gep1, align 4
3356 %gep2 = getelementptr i32, i32 addrspace(1)* %v2addr, i32 %idx
3357 %v2 = load i32, i32 addrspace(1)* %gep2, align 4
; Field 0: bits [3:0] of each operand. Every product below takes the %v2
; field as its first multiplicand and the %v1 field as its second.
3358 %and = and i32 %v1, 15
3359 %and1 = and i32 %v2, 15
3360 %mul1 = mul nuw nsw i32 %and1, %and
; Field 1: bits [7:4].
3362 %shr = lshr i32 %v1, 4
3363 %and2 = and i32 %shr, 15
3364 %shr3 = lshr i32 %v2, 4
3365 %and4 = and i32 %shr3, 15
3366 %mul2 = mul nuw nsw i32 %and4, %and2
; Field 2: bits [11:8].
3368 %shr6 = lshr i32 %v1, 8
3369 %and7 = and i32 %shr6, 15
3370 %shr8 = lshr i32 %v2, 8
3371 %and9 = and i32 %shr8, 15
3372 %mul3 = mul nuw nsw i32 %and9, %and7
; Field 3: bits [15:12].
3374 %shr12 = lshr i32 %v1, 12
3375 %and13 = and i32 %shr12, 15
3376 %shr14 = lshr i32 %v2, 12
3377 %and15 = and i32 %shr14, 15
3378 %mul4 = mul nuw nsw i32 %and15, %and13
; Field 4: bits [19:16].
3380 %shr18 = lshr i32 %v1, 16
3381 %and19 = and i32 %shr18, 15
3382 %shr20 = lshr i32 %v2, 16
3383 %and21 = and i32 %shr20, 15
3384 %mul5 = mul nuw nsw i32 %and21, %and19
; Field 5: bits [23:20].
3386 %shr24 = lshr i32 %v1, 20
3387 %and25 = and i32 %shr24, 15
3388 %shr26 = lshr i32 %v2, 20
3389 %and27 = and i32 %shr26, 15
3390 %mul6 = mul nuw nsw i32 %and27, %and25
; Field 6: bits [27:24].
3392 %shr30 = lshr i32 %v1, 24
3393 %and31 = and i32 %shr30, 15
3394 %shr32 = lshr i32 %v2, 24
3395 %and33 = and i32 %shr32, 15
3396 %mul7 = mul nuw nsw i32 %and33, %and31
; Field 7: bits [31:28]. A logical shift right by 28 leaves only four bits,
; so no mask follows the shift here.
3398 %shr36 = lshr i32 %v1, 28
3399 %shr37 = lshr i32 %v2, 28
3400 %mul8 = mul nuw nsw i32 %shr37, %shr36
; Accumulator is read from %dst and the result is written back to it.
3401 %acc = load i32, i32 addrspace(1)* %dst, align 4
; Add chain: %mul1 + %acc first, then %mul8 (top field), then %mul2..%mul7.
3403 %add1 = add i32 %mul1, %acc
3404 %add2 = add i32 %add1, %mul8
3405 %add3 = add i32 %add2, %mul2
3406 %add4 = add i32 %add3, %mul3
3407 %add5 = add i32 %add4, %mul4
3408 %add6 = add i32 %add5, %mul5
3409 %add7 = add i32 %add6, %mul6
3410 %add8 = add i32 %add7, %mul7
3411 store i32 %add8, i32 addrspace(1)* %dst, align 4
; Intrinsic called by the kernels above to obtain the i32 value (%idx) that
; indexes each workitem's element of the source arrays.
3415 declare i32 @llvm.amdgcn.workitem.id.x()