1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; udot4_acc32: unsigned 4 x i8 dot product accumulated into an i32.
; Each work-item (index taken from llvm.amdgcn.workitem.id.x) loads one
; <4 x i8> element from %src1 and one from %src2, zero-extends every lane to
; i32, multiplies matching lanes, and adds the four products to the i32 read
; from %dst before storing the sum back to %dst.
; Per the assertions in this function, the gfx906 and gfx1011/gfx1012 runs
; expect a single v_dot4_u32_u8, the gfx700 and gfx803 runs expect a chain of
; v_mad_u32_u24, and the gfx900 run expects SDWA multiplies combined with
; v_add3_u32.
; NOTE(review): in this view the gfx700 and gfx803 assertion groups stop at the
; final store with no s_endpgm line, unlike the other run lines; confirm
; against freshly regenerated assertions.
9 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
10 ; GFX7-LABEL: udot4_acc32:
11 ; GFX7: ; %bb.0: ; %entry
12 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
13 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
14 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
15 ; GFX7-NEXT: s_mov_b32 s10, 0
16 ; GFX7-NEXT: s_mov_b32 s11, s3
17 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
19 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
20 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
21 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
23 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
25 ; GFX7-NEXT: s_mov_b32 s2, -1
26 ; GFX7-NEXT: s_waitcnt vmcnt(1)
27 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
28 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
29 ; GFX7-NEXT: s_waitcnt vmcnt(0)
30 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
31 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
32 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4
34 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
35 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
36 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
37 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
38 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
39 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
40 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
41 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
44 ; GFX8-LABEL: udot4_acc32:
45 ; GFX8: ; %bb.0: ; %entry
46 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
47 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
48 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
49 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
51 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
52 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
53 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
54 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
55 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
56 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
57 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
58 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
59 ; GFX8-NEXT: s_waitcnt vmcnt(1)
60 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
61 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
62 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
63 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
64 ; GFX8-NEXT: s_waitcnt vmcnt(0)
65 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
66 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
67 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
69 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
70 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
71 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
72 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
73 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
74 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
75 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
76 ; GFX8-NEXT: flat_store_dword v[0:1], v2
79 ; GFX9-NODL-LABEL: udot4_acc32:
80 ; GFX9-NODL: ; %bb.0: ; %entry
81 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
82 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
83 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
84 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
86 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
87 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
88 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
89 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
90 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
91 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
92 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
93 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
94 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
95 ; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
96 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
97 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
98 ; GFX9-NODL-NEXT: s_endpgm
100 ; GFX9-DL-LABEL: udot4_acc32:
101 ; GFX9-DL: ; %bb.0: ; %entry
102 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
103 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
104 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
105 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
107 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
108 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
109 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
111 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
112 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
113 ; GFX9-DL-NEXT: s_endpgm
115 ; GFX10-DL-LABEL: udot4_acc32:
116 ; GFX10-DL: ; %bb.0: ; %entry
117 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
118 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
119 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
120 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX10-DL-NEXT: s_clause 0x1
122 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
123 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
124 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
125 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
126 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
127 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
128 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
129 ; GFX10-DL-NEXT: s_endpgm
130 ptr addrspace(1) %src2,
131 ptr addrspace(1) nocapture %dst) {
; Select this work-item's <4 x i8> element from each source buffer.
133 %idx = call i32 @llvm.amdgcn.workitem.id.x()
134 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
135 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
136 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
137 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Lane 0: zero-extend both bytes to i32 and multiply.
139 %v1e0 = extractelement <4 x i8> %vec1, i64 0
140 %cv1e0 = zext i8 %v1e0 to i32
141 %v2e0 = extractelement <4 x i8> %vec2, i64 0
142 %cv2e0 = zext i8 %v2e0 to i32
143 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
; Lane 1.
145 %v1e1 = extractelement <4 x i8> %vec1, i64 1
146 %cv1e1 = zext i8 %v1e1 to i32
147 %v2e1 = extractelement <4 x i8> %vec2, i64 1
148 %cv2e1 = zext i8 %v2e1 to i32
149 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
; Lane 2.
151 %v1e2 = extractelement <4 x i8> %vec1, i64 2
152 %cv1e2 = zext i8 %v1e2 to i32
153 %v2e2 = extractelement <4 x i8> %vec2, i64 2
154 %cv2e2 = zext i8 %v2e2 to i32
155 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
; Lane 3.
157 %v1e3 = extractelement <4 x i8> %vec1, i64 3
158 %cv1e3 = zext i8 %v1e3 to i32
159 %v2e3 = extractelement <4 x i8> %vec2, i64 3
160 %cv2e3 = zext i8 %v2e3 to i32
161 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; Accumulate: the first add takes the value loaded from %dst, then the
; remaining products are added in lane order.
163 %acc = load i32, ptr addrspace(1) %dst, align 4
164 %mad1 = add i32 %mul1, %acc
165 %mad2 = add i32 %mad1, %mul2
166 %mad3 = add i32 %mad2, %mul3
167 %mad4 = add i32 %mad3, %mul4
; Write the updated accumulator back to %dst.
169 store i32 %mad4, ptr addrspace(1) %dst, align 4
; udot4_acc16: same shape as a 4 x i8 unsigned dot product, but the lanes are
; zero-extended to i16 and the accumulator loaded from / stored to %dst is i16.
; None of the assertions in this function contain a dot4 instruction: the
; gfx700 run expects a v_mad_u32_u24 chain with 16-bit load/store, the gfx803
; and gfx1011/gfx1012 runs expect v_mad_u16, and the gfx900 and gfx906 runs
; expect v_mad_legacy_u16.
; NOTE(review): the i16 multiplies carry nsw although a product of two
; zero-extended bytes can exceed the signed i16 range (255 * 255 = 65025),
; which would make the result poison for such inputs; presumably intentional
; for the pattern under test, confirm before changing.
173 define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
174 ; GFX7-LABEL: udot4_acc16:
175 ; GFX7: ; %bb.0: ; %entry
176 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
177 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
178 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
179 ; GFX7-NEXT: s_mov_b32 s10, 0
180 ; GFX7-NEXT: s_mov_b32 s11, s3
181 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
183 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
184 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
185 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
186 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
187 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
188 ; GFX7-NEXT: s_mov_b32 s2, -1
189 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
190 ; GFX7-NEXT: s_waitcnt vmcnt(2)
191 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
192 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
193 ; GFX7-NEXT: s_waitcnt vmcnt(1)
194 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
195 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
196 ; GFX7-NEXT: s_waitcnt vmcnt(0)
197 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
198 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
199 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
200 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
201 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
202 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
203 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
204 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
205 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
206 ; GFX7-NEXT: s_endpgm
208 ; GFX8-LABEL: udot4_acc16:
209 ; GFX8: ; %bb.0: ; %entry
210 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
211 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
212 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
213 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
214 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
215 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
216 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
217 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
218 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
219 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
220 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
221 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
222 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
223 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
224 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
225 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
226 ; GFX8-NEXT: s_waitcnt vmcnt(2)
227 ; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
228 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
229 ; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
230 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
231 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
232 ; GFX8-NEXT: s_waitcnt vmcnt(1)
233 ; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2
234 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2
235 ; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9
236 ; GFX8-NEXT: s_waitcnt vmcnt(0)
237 ; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4
238 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
239 ; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4
240 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
241 ; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4
242 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
243 ; GFX8-NEXT: flat_store_short v[0:1], v2
244 ; GFX8-NEXT: s_endpgm
246 ; GFX9-NODL-LABEL: udot4_acc16:
247 ; GFX9-NODL: ; %bb.0: ; %entry
248 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
249 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
250 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
251 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
252 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
254 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
255 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
256 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3]
257 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
258 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
259 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
260 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2
261 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
262 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
263 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6
264 ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7
265 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
266 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3
267 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
268 ; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
269 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3
270 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
271 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
272 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3
273 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
274 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3]
275 ; GFX9-NODL-NEXT: s_endpgm
277 ; GFX9-DL-LABEL: udot4_acc16:
278 ; GFX9-DL: ; %bb.0: ; %entry
279 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
280 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
281 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
282 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
283 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
285 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
286 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
287 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
288 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
289 ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
290 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
291 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2
292 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
293 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
294 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
295 ; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7
296 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
297 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3
298 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
299 ; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
300 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3
301 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
302 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
303 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3
304 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
305 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
306 ; GFX9-DL-NEXT: s_endpgm
308 ; GFX10-DL-LABEL: udot4_acc16:
309 ; GFX10-DL: ; %bb.0: ; %entry
310 ; GFX10-DL-NEXT: s_clause 0x1
311 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
312 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
313 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
314 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
315 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
316 ; GFX10-DL-NEXT: s_clause 0x1
317 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
318 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
319 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
320 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
321 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
322 ; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
323 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
324 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
325 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
326 ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2
327 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
328 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
329 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
331 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
332 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
333 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
334 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
335 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3
336 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
337 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
338 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
339 ; GFX10-DL-NEXT: s_endpgm
340 ptr addrspace(1) %src2,
341 ptr addrspace(1) nocapture %dst) {
; Select this work-item's <4 x i8> element from each source buffer.
343 %idx = call i32 @llvm.amdgcn.workitem.id.x()
344 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
345 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
346 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
347 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Lane 0: zero-extend both bytes to i16 and multiply.
349 %v1e0 = extractelement <4 x i8> %vec1, i64 0
350 %cv1e0 = zext i8 %v1e0 to i16
351 %v2e0 = extractelement <4 x i8> %vec2, i64 0
352 %cv2e0 = zext i8 %v2e0 to i16
353 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
; Lane 1.
355 %v1e1 = extractelement <4 x i8> %vec1, i64 1
356 %cv1e1 = zext i8 %v1e1 to i16
357 %v2e1 = extractelement <4 x i8> %vec2, i64 1
358 %cv2e1 = zext i8 %v2e1 to i16
359 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
; Lane 2.
361 %v1e2 = extractelement <4 x i8> %vec1, i64 2
362 %cv1e2 = zext i8 %v1e2 to i16
363 %v2e2 = extractelement <4 x i8> %vec2, i64 2
364 %cv2e2 = zext i8 %v2e2 to i16
365 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
; Lane 3.
367 %v1e3 = extractelement <4 x i8> %vec1, i64 3
368 %cv1e3 = zext i8 %v1e3 to i16
369 %v2e3 = extractelement <4 x i8> %vec2, i64 3
370 %cv2e3 = zext i8 %v2e3 to i16
371 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
; Accumulate into the i16 read from %dst; the adds carry no wrap flags.
373 %acc = load i16, ptr addrspace(1) %dst, align 2
374 %mad1 = add i16 %mul1, %acc
375 %mad2 = add i16 %mad1, %mul2
376 %mad3 = add i16 %mad2, %mul3
377 %mad4 = add i16 %mad3, %mul4
; Write the updated accumulator back to %dst.
379 store i16 %mad4, ptr addrspace(1) %dst, align 2
; udot4_acc8: 4-lane byte multiply-accumulate done entirely in i8.
; Each work-item loads one <4 x i8> element from %src1 and one from %src2,
; multiplies matching lanes as i8 (no extension), and adds the four products
; to the i8 read from %dst before storing the result back to %dst.
; None of the assertions in this function contain a dot4 instruction: the
; gfx700 run expects v_mad_u32_u24, the gfx803 and gfx1011/gfx1012 runs expect
; v_mad_u16, and the gfx900 and gfx906 runs expect v_mad_legacy_u16, each
; ending in a byte store.
; NOTE(review): the i8 multiplies are flagged nuw nsw, so any lane product
; outside the signed i8 range would be poison; presumably intentional for the
; pattern under test, confirm before changing.
383 define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
384 ; GFX7-LABEL: udot4_acc8:
385 ; GFX7: ; %bb.0: ; %entry
386 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
387 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
388 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
389 ; GFX7-NEXT: s_mov_b32 s10, 0
390 ; GFX7-NEXT: s_mov_b32 s11, s3
391 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
393 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
394 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
395 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
396 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
397 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
398 ; GFX7-NEXT: s_mov_b32 s2, -1
399 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
400 ; GFX7-NEXT: s_waitcnt vmcnt(2)
401 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
402 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
403 ; GFX7-NEXT: s_waitcnt vmcnt(1)
404 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
405 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
406 ; GFX7-NEXT: s_waitcnt vmcnt(0)
407 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
408 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
409 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
410 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
411 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
412 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
413 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
414 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
415 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
416 ; GFX7-NEXT: s_endpgm
418 ; GFX8-LABEL: udot4_acc8:
419 ; GFX8: ; %bb.0: ; %entry
420 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
421 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
422 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
423 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
425 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
426 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
427 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
428 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
429 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
430 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
431 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
432 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
433 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
434 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
435 ; GFX8-NEXT: s_waitcnt vmcnt(2)
436 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
437 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
438 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
439 ; GFX8-NEXT: s_waitcnt vmcnt(1)
440 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
441 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
442 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
443 ; GFX8-NEXT: s_waitcnt vmcnt(0)
444 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
445 ; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2
446 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
447 ; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2
448 ; GFX8-NEXT: flat_store_byte v[0:1], v2
449 ; GFX8-NEXT: s_endpgm
451 ; GFX9-NODL-LABEL: udot4_acc8:
452 ; GFX9-NODL: ; %bb.0: ; %entry
453 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
454 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
455 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
456 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
458 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
459 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
460 ; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
461 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
462 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
463 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
464 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
465 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
466 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
467 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
469 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
470 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1
471 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
472 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
473 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1
474 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
475 ; GFX9-NODL-NEXT: s_endpgm
477 ; GFX9-DL-LABEL: udot4_acc8:
478 ; GFX9-DL: ; %bb.0: ; %entry
479 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
480 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
481 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
482 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
484 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
485 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
486 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
487 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
488 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
489 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
490 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
491 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
492 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
493 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
494 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
495 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
496 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1
497 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
498 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
499 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1
500 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
501 ; GFX9-DL-NEXT: s_endpgm
503 ; GFX10-DL-LABEL: udot4_acc8:
504 ; GFX10-DL: ; %bb.0: ; %entry
505 ; GFX10-DL-NEXT: s_clause 0x1
506 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
507 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
508 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
509 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
510 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
511 ; GFX10-DL-NEXT: s_clause 0x1
512 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
513 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
514 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
515 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
516 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
517 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
518 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
519 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
520 ; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4
521 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
522 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
523 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
524 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
525 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
526 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
527 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
528 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
529 ; GFX10-DL-NEXT: s_endpgm
530 ptr addrspace(1) %src2,
531 ptr addrspace(1) nocapture %dst) {
; Select this work-item's <4 x i8> element from each source buffer.
533 %idx = call i32 @llvm.amdgcn.workitem.id.x()
534 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
535 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
536 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
537 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Lane 0: multiply the raw bytes as i8, with no widening.
539 %v1e0 = extractelement <4 x i8> %vec1, i64 0
540 %v2e0 = extractelement <4 x i8> %vec2, i64 0
541 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
; Lane 1.
543 %v1e1 = extractelement <4 x i8> %vec1, i64 1
544 %v2e1 = extractelement <4 x i8> %vec2, i64 1
545 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
; Lane 2.
547 %v1e2 = extractelement <4 x i8> %vec1, i64 2
548 %v2e2 = extractelement <4 x i8> %vec2, i64 2
549 %mul3 = mul nuw nsw i8 %v1e2, %v2e2
; Lane 3.
551 %v1e3 = extractelement <4 x i8> %vec1, i64 3
552 %v2e3 = extractelement <4 x i8> %vec2, i64 3
553 %mul4 = mul nuw nsw i8 %v1e3, %v2e3
; Accumulate into the i8 read from %dst. The access is annotated align 2,
; which is stricter than the natural alignment of an i8.
555 %acc = load i8, ptr addrspace(1) %dst, align 2
556 %mad1 = add i8 %mul1, %acc
557 %mad2 = add i8 %mad1, %mul2
558 %mad3 = add i8 %mad2, %mul3
559 %mad4 = add i8 %mad3, %mul4
; Write the updated accumulator back to %dst.
561 store i8 %mad4, ptr addrspace(1) %dst, align 2
565 ; TODO: Generate udot4?
; udot2_8: two-lane variant of the i8 multiply-accumulate.
; Each work-item still loads a full <4 x i8> element from %src1 and %src2, but
; only lanes 0 and 1 are multiplied (as i8) and added to the i8 read from
; %dst; lanes 2 and 3 are never extracted.
; The assertions for every run line expect exactly two multiply-add
; instructions (v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16 depending on the
; target) followed by a byte store; no dot instruction is expected.
566 define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
567 ; GFX7-LABEL: udot2_8:
568 ; GFX7: ; %bb.0: ; %entry
569 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
570 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
571 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
572 ; GFX7-NEXT: s_mov_b32 s10, 0
573 ; GFX7-NEXT: s_mov_b32 s11, s3
574 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
575 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
576 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
577 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
578 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
579 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
580 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
581 ; GFX7-NEXT: s_mov_b32 s2, -1
582 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
583 ; GFX7-NEXT: s_waitcnt vmcnt(2)
584 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
585 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
586 ; GFX7-NEXT: s_waitcnt vmcnt(1)
587 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
588 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
589 ; GFX7-NEXT: s_waitcnt vmcnt(0)
590 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
591 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
592 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
593 ; GFX7-NEXT: s_endpgm
595 ; GFX8-LABEL: udot2_8:
596 ; GFX8: ; %bb.0: ; %entry
597 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
598 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
599 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
600 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
601 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
602 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
603 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
604 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
605 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
606 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
607 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
608 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
609 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
610 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
611 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
612 ; GFX8-NEXT: s_waitcnt vmcnt(2)
613 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
614 ; GFX8-NEXT: s_waitcnt vmcnt(1)
615 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2
616 ; GFX8-NEXT: s_waitcnt vmcnt(0)
617 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
618 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
619 ; GFX8-NEXT: flat_store_byte v[0:1], v2
620 ; GFX8-NEXT: s_endpgm
622 ; GFX9-NODL-LABEL: udot2_8:
623 ; GFX9-NODL: ; %bb.0: ; %entry
624 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
625 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
626 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
627 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
628 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
629 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
630 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
631 ; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3]
632 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
633 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
634 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
635 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
636 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
637 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
638 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
639 ; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3]
640 ; GFX9-NODL-NEXT: s_endpgm
642 ; GFX9-DL-LABEL: udot2_8:
643 ; GFX9-DL: ; %bb.0: ; %entry
644 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
645 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
646 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
647 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
648 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
650 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
651 ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
652 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
653 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
654 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
655 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
656 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
657 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
658 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
659 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
660 ; GFX9-DL-NEXT: s_endpgm
662 ; GFX10-DL-LABEL: udot2_8:
663 ; GFX10-DL: ; %bb.0: ; %entry
664 ; GFX10-DL-NEXT: s_clause 0x1
665 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
666 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
667 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
668 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
669 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX10-DL-NEXT: s_clause 0x1
671 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
672 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
673 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
674 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
675 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
676 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
677 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
678 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
679 ; GFX10-DL-NEXT: v_mad_u16 v2, v2, v3, v4
680 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v2
681 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
682 ; GFX10-DL-NEXT: s_endpgm
683 ptr addrspace(1) %src2,
684 ptr addrspace(1) nocapture %dst) {
; Select this work-item's <4 x i8> element from each source buffer.
686 %idx = call i32 @llvm.amdgcn.workitem.id.x()
687 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
688 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
689 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
690 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Lane 0: multiply the raw bytes as i8.
692 %v1e0 = extractelement <4 x i8> %vec1, i64 0
693 %v2e0 = extractelement <4 x i8> %vec2, i64 0
694 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
; Lane 1.
696 %v1e1 = extractelement <4 x i8> %vec1, i64 1
697 %v2e1 = extractelement <4 x i8> %vec2, i64 1
698 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
; Add both products to the i8 read from %dst and store the result back.
700 %acc = load i8, ptr addrspace(1) %dst, align 2
701 %mad1 = add i8 %mul1, %acc
702 %mad2 = add i8 %mad1, %mul2
703 store i8 %mad2, ptr addrspace(1) %dst, align 2
; udot4_CommutationInsideMAD: i8 multiply-accumulate chain over two <4 x i8>
; vectors in which the operands are written in swapped order: every mul takes
; the %vec2 element first, and %mad2..%mad4 take the product as the first add
; operand. For every RUN configuration the checked output is a chain of four
; multiply-add instructions; no dot instruction appears in the assertions.
707 define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
708 ; GFX7-LABEL: udot4_CommutationInsideMAD:
709 ; GFX7: ; %bb.0: ; %entry
710 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
711 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
712 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
713 ; GFX7-NEXT: s_mov_b32 s10, 0
714 ; GFX7-NEXT: s_mov_b32 s11, s3
715 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
716 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
717 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
718 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
719 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
720 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
721 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
722 ; GFX7-NEXT: s_mov_b32 s2, -1
723 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
724 ; GFX7-NEXT: s_waitcnt vmcnt(2)
725 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
726 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
727 ; GFX7-NEXT: s_waitcnt vmcnt(1)
728 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
729 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
730 ; GFX7-NEXT: s_waitcnt vmcnt(0)
731 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
732 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
733 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
734 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
735 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
736 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
737 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
738 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
739 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
740 ; GFX7-NEXT: s_endpgm
742 ; GFX8-LABEL: udot4_CommutationInsideMAD:
743 ; GFX8: ; %bb.0: ; %entry
744 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
745 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
746 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
747 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
749 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
750 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
751 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
752 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
753 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
754 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
755 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
756 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
757 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
758 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
759 ; GFX8-NEXT: s_waitcnt vmcnt(2)
760 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
761 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
762 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
763 ; GFX8-NEXT: s_waitcnt vmcnt(1)
764 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
765 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
766 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
767 ; GFX8-NEXT: s_waitcnt vmcnt(0)
768 ; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4
769 ; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2
770 ; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2
771 ; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2
772 ; GFX8-NEXT: flat_store_byte v[0:1], v2
773 ; GFX8-NEXT: s_endpgm
775 ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
776 ; GFX9-NODL: ; %bb.0: ; %entry
777 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
778 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
779 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
780 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
781 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
782 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
783 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
784 ; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
785 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
786 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
787 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
788 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
789 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
790 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
791 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
792 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3
793 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
794 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1
795 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
796 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1
797 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1
798 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
799 ; GFX9-NODL-NEXT: s_endpgm
801 ; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
802 ; GFX9-DL: ; %bb.0: ; %entry
803 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
804 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
805 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
806 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
807 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
808 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
809 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
810 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
811 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
812 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
813 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
814 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
815 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
816 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
817 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
818 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3
819 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
820 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1
821 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
822 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1
823 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1
824 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
825 ; GFX9-DL-NEXT: s_endpgm
827 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
828 ; GFX10-DL: ; %bb.0: ; %entry
829 ; GFX10-DL-NEXT: s_clause 0x1
830 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
831 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
832 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
833 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
834 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
835 ; GFX10-DL-NEXT: s_clause 0x1
836 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
837 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
838 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
839 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
840 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
841 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
842 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
843 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
844 ; GFX10-DL-NEXT: v_mad_u16 v4, v3, v2, v4
845 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
846 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
847 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
848 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
849 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
850 ; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0
851 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
852 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
853 ; GFX10-DL-NEXT: s_endpgm
854 ptr addrspace(1) %src2,
855 ptr addrspace(1) nocapture %dst) {
857 %idx = call i32 @llvm.amdgcn.workitem.id.x()
858 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
859 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
860 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
861 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Per-element i8 products; each mul names the %vec2 element before the %vec1
; element.
863 %v1e0 = extractelement <4 x i8> %vec1, i64 0
864 %v2e0 = extractelement <4 x i8> %vec2, i64 0
865 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
867 %v1e1 = extractelement <4 x i8> %vec1, i64 1
868 %v2e1 = extractelement <4 x i8> %vec2, i64 1
869 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
871 %v1e2 = extractelement <4 x i8> %vec1, i64 2
872 %v2e2 = extractelement <4 x i8> %vec2, i64 2
873 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
875 %v1e3 = extractelement <4 x i8> %vec1, i64 3
876 %v2e3 = extractelement <4 x i8> %vec2, i64 3
877 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
; Accumulate onto the i8 value loaded from %dst. %mad1 has the accumulator as
; its first operand; %mad2..%mad4 have the product first.
879 %acc = load i8, ptr addrspace(1) %dst, align 2
880 %mad1 = add i8 %acc, %mul1
881 %mad2 = add i8 %mul2, %mad1
882 %mad3 = add i8 %mul3, %mad2
883 %mad4 = add i8 %mul4, %mad3
885 store i8 %mad4, ptr addrspace(1) %dst, align 2
889 ; TODO: Support commutation across the adds.
; udot4_CommutationAccrossMADs: the same four i8 element products as in a
; 4-element dot product, but the add chain consumes %mul2 before %mul1, so the
; first two products are accumulated out of element order. For every RUN
; configuration the checked output is a chain of four multiply-adds whose
; first one operates on byte 1 of each loaded dword.
890 define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
891 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
892 ; GFX7: ; %bb.0: ; %entry
893 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
894 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
895 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
896 ; GFX7-NEXT: s_mov_b32 s10, 0
897 ; GFX7-NEXT: s_mov_b32 s11, s3
898 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
899 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
900 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
901 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
902 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
903 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
904 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
905 ; GFX7-NEXT: s_mov_b32 s2, -1
906 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
907 ; GFX7-NEXT: s_waitcnt vmcnt(2)
908 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
909 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
910 ; GFX7-NEXT: s_waitcnt vmcnt(1)
911 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
912 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
913 ; GFX7-NEXT: s_waitcnt vmcnt(0)
914 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
915 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
916 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
917 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
918 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
919 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
920 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1
921 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
922 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
923 ; GFX7-NEXT: s_endpgm
925 ; GFX8-LABEL: udot4_CommutationAccrossMADs:
926 ; GFX8: ; %bb.0: ; %entry
927 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
928 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
929 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
930 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
931 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
932 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
933 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
934 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
935 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
936 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
937 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
938 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
939 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
940 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
941 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
942 ; GFX8-NEXT: s_waitcnt vmcnt(2)
943 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
944 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
945 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3
946 ; GFX8-NEXT: s_waitcnt vmcnt(1)
947 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2
948 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
949 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2
950 ; GFX8-NEXT: s_waitcnt vmcnt(0)
951 ; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4
952 ; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4
953 ; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2
954 ; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2
955 ; GFX8-NEXT: flat_store_byte v[0:1], v2
956 ; GFX8-NEXT: s_endpgm
958 ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
959 ; GFX9-NODL: ; %bb.0: ; %entry
960 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
961 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
962 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
963 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
964 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
965 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
966 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
967 ; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
968 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
969 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
970 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
971 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
972 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
973 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
974 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3
975 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
976 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
977 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3
978 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
979 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1
980 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1
981 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
982 ; GFX9-NODL-NEXT: s_endpgm
984 ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
985 ; GFX9-DL: ; %bb.0: ; %entry
986 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
987 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
988 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
989 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
990 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
991 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
992 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
993 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
994 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
995 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
996 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
997 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
998 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
999 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1000 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3
1001 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1002 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
1003 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3
1004 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v2
1005 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1
1006 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1
1007 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
1008 ; GFX9-DL-NEXT: s_endpgm
1010 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
1011 ; GFX10-DL: ; %bb.0: ; %entry
1012 ; GFX10-DL-NEXT: s_clause 0x1
1013 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1014 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1015 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1016 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1017 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX10-DL-NEXT: s_clause 0x1
1019 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1020 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1021 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1022 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1023 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
1024 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1025 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
1026 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1027 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
1028 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1029 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
1030 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
1031 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1032 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1033 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0
1034 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
1035 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1036 ; GFX10-DL-NEXT: s_endpgm
1037 ptr addrspace(1) %src2,
1038 ptr addrspace(1) nocapture %dst) {
1040 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1041 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1042 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1043 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1044 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Per-element i8 products, numbered %mul1..%mul4 by element index 0..3.
1046 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1047 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1048 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
1050 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1051 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1052 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
1054 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1055 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1056 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
1058 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1059 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1060 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
; The chain adds %mul2 first and %mul1 second; %mul3 and %mul4 follow in
; element order.
1062 %acc = load i8, ptr addrspace(1) %dst, align 2
1063 %mad1 = add i8 %acc, %mul2
1064 %mad2 = add i8 %mad1, %mul1
1065 %mad3 = add i8 %mad2, %mul3
1066 %mad4 = add i8 %mad3, %mul4
1068 store i8 %mad4, ptr addrspace(1) %dst, align 2
; udot4_multiuse_mul1: products of zero-extended i8 elements accumulated in
; i32, where %mul1 has two uses (%add and %add2), so the stored sum contains
; the element-0 product twice. For every RUN configuration the checked output
; keeps separate multiply / multiply-add instructions; no dot instruction
; appears in the assertions.
1072 define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
1073 ; GFX7-LABEL: udot4_multiuse_mul1:
1074 ; GFX7: ; %bb.0: ; %entry
1075 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1076 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1077 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1078 ; GFX7-NEXT: s_mov_b32 s10, 0
1079 ; GFX7-NEXT: s_mov_b32 s11, s3
1080 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1081 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1082 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1083 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1084 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1085 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1086 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1087 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1088 ; GFX7-NEXT: s_mov_b32 s2, -1
1089 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1090 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
1091 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1092 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
1094 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1095 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4
1097 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8
1098 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1099 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1100 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
1101 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1102 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1103 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1104 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1105 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1106 ; GFX7-NEXT: s_endpgm
1108 ; GFX8-LABEL: udot4_multiuse_mul1:
1109 ; GFX8: ; %bb.0: ; %entry
1110 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1111 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1112 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1113 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1114 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1115 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1116 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1117 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1118 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1119 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1120 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1121 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1122 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1123 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1124 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
1125 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
1126 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
1127 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1128 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1129 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
1130 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
1131 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1132 ; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2
1133 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
1134 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
1135 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
1136 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1137 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
1138 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
1139 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1140 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1141 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1142 ; GFX8-NEXT: s_endpgm
1144 ; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1145 ; GFX9-NODL: ; %bb.0: ; %entry
1146 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1147 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1148 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1149 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1150 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1151 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1152 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
1153 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1154 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1155 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
1156 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1157 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
1158 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1159 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1160 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1161 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4
1162 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1163 ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0
1164 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2
1165 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
1166 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1167 ; GFX9-NODL-NEXT: s_endpgm
1169 ; GFX9-DL-LABEL: udot4_multiuse_mul1:
1170 ; GFX9-DL: ; %bb.0: ; %entry
1171 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1172 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1173 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1174 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1175 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1176 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1177 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1178 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1179 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1180 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1
1181 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1182 ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2
1183 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1184 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1185 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1186 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4
1187 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1188 ; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0
1189 ; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2
1190 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1
1191 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1192 ; GFX9-DL-NEXT: s_endpgm
1194 ; GFX10-DL-LABEL: udot4_multiuse_mul1:
1195 ; GFX10-DL: ; %bb.0: ; %entry
1196 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1197 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1198 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1199 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1200 ; GFX10-DL-NEXT: s_clause 0x1
1201 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1202 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1203 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1204 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1205 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
1206 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
1208 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1209 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3
1210 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
1212 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1213 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1214 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1215 ; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5
1216 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
1217 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
1218 ; GFX10-DL-NEXT: s_endpgm
1219 ptr addrspace(1) %src2,
1220 ptr addrspace(1) nocapture %dst) {
1222 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1223 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1224 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1225 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1226 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Each element is zero-extended to i32 before being multiplied.
1228 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1229 %cv1e0 = zext i8 %v1e0 to i32
1230 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1231 %cv2e0 = zext i8 %v2e0 to i32
1232 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1234 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1235 %cv1e1 = zext i8 %v1e1 to i32
1236 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1237 %cv2e1 = zext i8 %v2e1 to i32
1238 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1240 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1241 %cv1e2 = zext i8 %v1e2 to i32
1242 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1243 %cv2e2 = zext i8 %v2e2 to i32
1244 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1246 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1247 %cv1e3 = zext i8 %v1e3 to i32
1248 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1249 %cv2e3 = zext i8 %v2e3 to i32
1250 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; %mul1 is added twice: once in %add and again in %add2.
1252 %acc = load i32, ptr addrspace(1) %dst, align 4
1253 %add = add i32 %mul1, %acc
1254 %add1 = add i32 %mul2, %add
1255 %add2 = add i32 %add1, %mul1
1256 %add3 = add i32 %add2, %mul3
1257 %add4 = add i32 %add3, %mul4
1259 store i32 %add4, ptr addrspace(1) %dst, align 4
; udot4_multiuse_add1: products of zero-extended i8 elements accumulated in
; i32, where the first partial sum %add1 (%mul2 + %acc) has two uses: it
; continues the accumulation chain through %add2 and also feeds %add, which is
; added to the final result. For every RUN configuration the checked output
; uses multiply / multiply-add instructions plus extra adds; no dot
; instruction appears in the assertions.
1263 define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
1264 ; GFX7-LABEL: udot4_multiuse_add1:
1265 ; GFX7: ; %bb.0: ; %entry
1266 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1267 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1268 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1269 ; GFX7-NEXT: s_mov_b32 s10, 0
1270 ; GFX7-NEXT: s_mov_b32 s11, s3
1271 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1272 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1273 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1274 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1275 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1276 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1277 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1278 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1279 ; GFX7-NEXT: s_mov_b32 s2, -1
1280 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1281 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1282 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
1283 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1284 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1285 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
1286 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1287 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4
1288 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1289 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1290 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
1291 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1292 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1293 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1294 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3
1295 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1296 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6
1297 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1298 ; GFX7-NEXT: s_endpgm
1300 ; GFX8-LABEL: udot4_multiuse_add1:
1301 ; GFX8: ; %bb.0: ; %entry
1302 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1303 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1304 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1305 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1306 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1307 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1308 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1309 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1310 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1311 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1312 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1313 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1314 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1315 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1316 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
1317 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
1318 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
1319 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1320 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1321 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
1322 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
1323 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1324 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2
1325 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
1326 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
1327 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1328 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
1329 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4
1330 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
1331 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
1332 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1333 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1334 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1335 ; GFX8-NEXT: s_endpgm
1337 ; GFX9-NODL-LABEL: udot4_multiuse_add1:
1338 ; GFX9-NODL: ; %bb.0: ; %entry
1339 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1340 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1341 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1342 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1343 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1344 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1345 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
1346 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1347 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1348 ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
1349 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1350 ; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8
1351 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1352 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1353 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1354 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
1356 ; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2
1357 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6
1358 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4
1359 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1360 ; GFX9-NODL-NEXT: s_endpgm
1362 ; GFX9-DL-LABEL: udot4_multiuse_add1:
1363 ; GFX9-DL: ; %bb.0: ; %entry
1364 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1365 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1366 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1367 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1368 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1369 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1370 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1371 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1372 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1373 ; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 8, 8
1374 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1375 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 8
1376 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1377 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1378 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1379 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1380 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
1381 ; GFX9-DL-NEXT: v_add_u32_e32 v4, s0, v2
1382 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v3, v6
1383 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v1, v4
1384 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1385 ; GFX9-DL-NEXT: s_endpgm
1387 ; GFX10-DL-LABEL: udot4_multiuse_add1:
1388 ; GFX10-DL: ; %bb.0: ; %entry
1389 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1390 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1391 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1392 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1393 ; GFX10-DL-NEXT: s_clause 0x1
1394 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1395 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1396 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1397 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1398 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 8
1399 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1400 ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8
1401 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1402 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1403 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
1404 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1405 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1406 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0
1407 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3
1408 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
1409 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
1410 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
1411 ; GFX10-DL-NEXT: s_endpgm
1412 ptr addrspace(1) %src2,
1413 ptr addrspace(1) nocapture %dst) {
1415 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1416 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1417 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1418 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1419 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Each element is zero-extended to i32 before being multiplied.
1421 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1422 %cv1e0 = zext i8 %v1e0 to i32
1423 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1424 %cv2e0 = zext i8 %v2e0 to i32
1425 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1427 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1428 %cv1e1 = zext i8 %v1e1 to i32
1429 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1430 %cv2e1 = zext i8 %v2e1 to i32
1431 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1433 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1434 %cv1e2 = zext i8 %v1e2 to i32
1435 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1436 %cv2e2 = zext i8 %v2e2 to i32
1437 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1439 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1440 %cv1e3 = zext i8 %v1e3 to i32
1441 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1442 %cv2e3 = zext i8 %v2e3 to i32
1443 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; %add1 is consumed by both %add and %add2; %add is folded back in by %res.
1445 %acc = load i32, ptr addrspace(1) %dst, align 4
1446 %add1 = add i32 %mul2, %acc
1447 %add = add i32 %add1, %acc
1448 %add2 = add i32 %add1, %mul1
1449 %add3 = add i32 %add2, %mul3
1450 %add4 = add i32 %add3, %mul4
1451 %res = add i32 %add4, %add
1452 store i32 %res, ptr addrspace(1) %dst, align 4
; notdot4_mixedtypes: each work item loads one <4 x i8> from %src1 and one from
; %src2 (both indexed by the workitem id), multiplies the vectors element by
; element in i16, adds the four products to the i16 loaded from %dst and stores
; the sum back to %dst.
; Element 0 of both vectors is sign-extended (sext) while elements 1-3 are
; zero-extended (zext). Every expected output below is a chain of four
; multiply-adds (v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16); none of them
; contains a dot-product instruction.
1456 define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
1457 ; GFX7-LABEL: notdot4_mixedtypes:
1458 ; GFX7: ; %bb.0: ; %entry
1459 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1460 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1461 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1462 ; GFX7-NEXT: s_mov_b32 s10, 0
1463 ; GFX7-NEXT: s_mov_b32 s11, s3
1464 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1465 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1466 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1467 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1468 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1469 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1470 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1471 ; GFX7-NEXT: s_mov_b32 s2, -1
1472 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
1473 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1474 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
1475 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
1476 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1477 ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8
1478 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
1479 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
1480 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
1481 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1482 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1483 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
1484 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
1485 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
1486 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1487 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1488 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
1489 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1490 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1491 ; GFX7-NEXT: s_endpgm
1493 ; GFX8-LABEL: notdot4_mixedtypes:
1494 ; GFX8: ; %bb.0: ; %entry
1495 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1496 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1497 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1498 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
1499 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1500 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1501 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1502 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1503 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1504 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1505 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1506 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1507 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
1508 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1509 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1510 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
1511 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1512 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
1513 ; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
1514 ; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
1515 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1516 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1517 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1518 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2
1519 ; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9
1520 ; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8
1521 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1522 ; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4
1523 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1524 ; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4
1525 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1526 ; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4
1527 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1528 ; GFX8-NEXT: flat_store_short v[0:1], v2
1529 ; GFX8-NEXT: s_endpgm
1531 ; GFX9-NODL-LABEL: notdot4_mixedtypes:
1532 ; GFX9-NODL: ; %bb.0: ; %entry
1533 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1534 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1535 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1536 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
1537 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1538 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1539 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1540 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1541 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3]
1542 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
1543 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
1544 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1545 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
1546 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6
1547 ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7
1548 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
1549 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8
1550 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1551 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3
1552 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1553 ; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1554 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3
1555 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
1556 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1557 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3
1558 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1559 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3]
1560 ; GFX9-NODL-NEXT: s_endpgm
1562 ; GFX9-DL-LABEL: notdot4_mixedtypes:
1563 ; GFX9-DL: ; %bb.0: ; %entry
1564 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1565 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1566 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1567 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
1568 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1569 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1570 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1571 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1572 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
1573 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1574 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1
1575 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1576 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
1577 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
1578 ; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7
1579 ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
1580 ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8
1581 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1582 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3
1583 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1584 ; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1585 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3
1586 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
1587 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1588 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3
1589 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1590 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
1591 ; GFX9-DL-NEXT: s_endpgm
1593 ; GFX10-DL-LABEL: notdot4_mixedtypes:
1594 ; GFX10-DL: ; %bb.0: ; %entry
1595 ; GFX10-DL-NEXT: s_clause 0x1
1596 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1597 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1598 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1599 ; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff
1600 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1601 ; GFX10-DL-NEXT: s_clause 0x1
1602 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1603 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1604 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1605 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
1606 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1607 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
1608 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1609 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
1610 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
1611 ; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8
1612 ; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
1613 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
1614 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1615 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
1616 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1617 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1618 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
1619 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1620 ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3
1621 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
1622 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
1623 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
1624 ; GFX10-DL-NEXT: s_endpgm
1625 ptr addrspace(1) %src2,
1626 ptr addrspace(1) nocapture %dst) {
; Per-work-item loads of the two packed byte vectors.
1628 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1629 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1630 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1631 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1632 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Lane 0: the only pair widened with sext; lanes 1-3 below use zext.
1634 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1635 %cv1e0 = sext i8 %v1e0 to i16
1636 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1637 %cv2e0 = sext i8 %v2e0 to i16
1638 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1640 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1641 %cv1e1 = zext i8 %v1e1 to i16
1642 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1643 %cv2e1 = zext i8 %v2e1 to i16
1644 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1646 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1647 %cv1e2 = zext i8 %v1e2 to i16
1648 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1649 %cv2e2 = zext i8 %v2e2 to i16
1650 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1652 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1653 %cv1e3 = zext i8 %v1e3 to i16
1654 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1655 %cv2e3 = zext i8 %v2e3 to i16
1656 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
; Accumulation order: acc + mul2, then + mul1, + mul3, + mul4.
1658 %acc = load i16, ptr addrspace(1) %dst, align 2
1659 %add1 = add i16 %mul2, %acc
1660 %add2 = add i16 %add1, %mul1
1661 %add3 = add i16 %add2, %mul3
1662 %add4 = add i16 %add3, %mul4
1664 store i16 %add4, ptr addrspace(1) %dst, align 2
1668 ; TODO: clean up s_lshr_b32 and support this pattern.
; NOTE(review): none of the assertions below contain s_lshr_b32, so the first
; half of the TODO above may be stale - confirm before acting on it.
;
; udot4_acc32_vecMul: each work item loads one <4 x i8> from %src1 and one from
; %src2 (both indexed by the workitem id), zero-extends both to <4 x i32> and
; multiplies them as whole vectors. The four lanes of the product are then
; extracted and added, lane 0 first, to the i32 loaded from %dst; the sum is
; stored back to %dst.
; Expected code: the GFX7 and GFX8 outputs are four chained v_mad_u32_u24; the
; GFX9 and GFX10 outputs are four separate 24-bit multiplies followed by two
; v_add3_u32. No dot-product instruction appears in any expected output.
1669 define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
1670 ; GFX7-LABEL: udot4_acc32_vecMul:
1671 ; GFX7: ; %bb.0: ; %entry
1672 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1673 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1674 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1675 ; GFX7-NEXT: s_mov_b32 s10, 0
1676 ; GFX7-NEXT: s_mov_b32 s11, s3
1677 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1678 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1679 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1680 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1681 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1682 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1683 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1684 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1685 ; GFX7-NEXT: s_mov_b32 s2, -1
1686 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1687 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
1688 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1689 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1690 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
1691 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1692 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
1693 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1694 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1695 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
1696 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1697 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
1698 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
1699 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
1700 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
1701 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1702 ; GFX7-NEXT: s_endpgm
1704 ; GFX8-LABEL: udot4_acc32_vecMul:
1705 ; GFX8: ; %bb.0: ; %entry
1706 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1707 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1708 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1709 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1710 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1711 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1712 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1713 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1714 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1715 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1716 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1717 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1718 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1719 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1720 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
1721 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
1722 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3
1723 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
1724 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1725 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
1726 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
1727 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
1728 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
1729 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1730 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
1731 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
1732 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
1733 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
1734 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1735 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1736 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1737 ; GFX8-NEXT: s_endpgm
1739 ; GFX9-NODL-LABEL: udot4_acc32_vecMul:
1740 ; GFX9-NODL: ; %bb.0: ; %entry
1741 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1742 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1743 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1744 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1745 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1746 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1747 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
1748 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1749 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1750 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1751 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1752 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1753 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1754 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1755 ; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
1756 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
1757 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1758 ; GFX9-NODL-NEXT: s_endpgm
1760 ; GFX9-DL-LABEL: udot4_acc32_vecMul:
1761 ; GFX9-DL: ; %bb.0: ; %entry
1762 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1763 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1764 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1765 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1766 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1767 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1768 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1769 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1770 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1771 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1772 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1773 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1774 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1775 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1776 ; GFX9-DL-NEXT: v_add3_u32 v2, v3, s0, v4
1777 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v5, v1
1778 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1779 ; GFX9-DL-NEXT: s_endpgm
1781 ; GFX10-DL-LABEL: udot4_acc32_vecMul:
1782 ; GFX10-DL: ; %bb.0: ; %entry
1783 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1784 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1785 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1786 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1787 ; GFX10-DL-NEXT: s_clause 0x1
1788 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1789 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1790 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff
1791 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1792 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1793 ; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1794 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1795 ; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1796 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1797 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0
1798 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1799 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1800 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1801 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1802 ; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0
1803 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
1804 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
1805 ; GFX10-DL-NEXT: s_endpgm
1806 ptr addrspace(1) %src2,
1807 ptr addrspace(1) nocapture %dst) {
; Per-work-item loads of the two packed byte vectors.
1809 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1810 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1811 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1812 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1813 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Widen and multiply as whole vectors; the lanes are only split afterwards.
1815 %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
1816 %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
1818 %mul = mul <4 x i32> %cvec1, %cvec2
1819 %mul0 = extractelement <4 x i32> %mul, i64 0
1820 %mul1 = extractelement <4 x i32> %mul, i64 1
1821 %mul2 = extractelement <4 x i32> %mul, i64 2
1822 %mul3 = extractelement <4 x i32> %mul, i64 3
; Accumulation order: acc + mul0, then + mul1, + mul2, + mul3.
1824 %acc = load i32, ptr addrspace(1) %dst, align 4
1825 %add1 = add i32 %mul0, %acc
1826 %add2 = add i32 %add1, %mul1
1827 %add3 = add i32 %add2, %mul2
1828 %add4 = add i32 %add3, %mul3
1830 store i32 %add4, ptr addrspace(1) %dst, align 4
1834 ; TODO: This pattern should be recognized.
;
; udot4_acc16_vecMul: each work item loads one <4 x i8> from %src1 and one from
; %src2 (both indexed by the workitem id), zero-extends both to <4 x i16> and
; multiplies them as whole vectors. The four lanes of the product are then
; extracted and added, lane 0 first, to the i16 loaded from %dst; the sum is
; stored back to %dst. Note that the i16 load and store of %dst are written
; with align 4.
; Expected code: the GFX7 output is four chained v_mad_u32_u24 and the GFX8
; output four chained v_mad_u16; the GFX9 and GFX10 outputs use two
; v_pk_mul_lo_u16 followed by four 16-bit adds. No dot-product instruction
; appears in any expected output.
1835 define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
1836 ; GFX7-LABEL: udot4_acc16_vecMul:
1837 ; GFX7: ; %bb.0: ; %entry
1838 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1839 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1840 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1841 ; GFX7-NEXT: s_mov_b32 s10, 0
1842 ; GFX7-NEXT: s_mov_b32 s11, s3
1843 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1844 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1845 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1846 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1847 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1848 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1849 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1850 ; GFX7-NEXT: s_mov_b32 s2, -1
1851 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
1852 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1853 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
1854 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2
1855 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1856 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
1857 ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
1858 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
1859 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
1860 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8
1861 ; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16
1862 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1863 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
1864 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1865 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
1866 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1867 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
1868 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
1869 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1870 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0
1871 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1872 ; GFX7-NEXT: s_endpgm
1874 ; GFX8-LABEL: udot4_acc16_vecMul:
1875 ; GFX8: ; %bb.0: ; %entry
1876 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1877 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1878 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1879 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
1880 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1881 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1882 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1883 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1884 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1885 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1886 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1887 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1888 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
1889 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1890 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1891 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
1892 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1893 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
1894 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3
1895 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1896 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
1897 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1898 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2
1899 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 8, v2
1900 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1901 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
1902 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1903 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1904 ; GFX8-NEXT: v_mad_u16 v2, v7, v9, v2
1905 ; GFX8-NEXT: v_mad_u16 v2, v10, v5, v2
1906 ; GFX8-NEXT: v_mad_u16 v2, v6, v8, v2
1907 ; GFX8-NEXT: flat_store_short v[0:1], v2
1908 ; GFX8-NEXT: s_endpgm
1910 ; GFX9-NODL-LABEL: udot4_acc16_vecMul:
1911 ; GFX9-NODL: ; %bb.0: ; %entry
1912 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1913 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1914 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1915 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
1916 ; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100
1917 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1918 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1919 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1920 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1921 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3]
1922 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
1923 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
1924 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1925 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1926 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2
1927 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2
1928 ; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1
1929 ; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1930 ; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2
1931 ; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1932 ; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1
1933 ; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1
1934 ; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1
1935 ; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1
1936 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1937 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5
1938 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1939 ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v2, v3
1940 ; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1941 ; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1
1942 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1943 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3]
1944 ; GFX9-NODL-NEXT: s_endpgm
1946 ; GFX9-DL-LABEL: udot4_acc16_vecMul:
1947 ; GFX9-DL: ; %bb.0: ; %entry
1948 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1949 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1950 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1951 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
1952 ; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100
1953 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1954 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1955 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1956 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1957 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
1958 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1959 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
1960 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1961 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1962 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2
1963 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2
1964 ; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1
1965 ; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1966 ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2
1967 ; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1968 ; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1
1969 ; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1
1970 ; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1
1971 ; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1
1972 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1973 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5
1974 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1975 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3
1976 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1977 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
1978 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1979 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
1980 ; GFX9-DL-NEXT: s_endpgm
1982 ; GFX10-DL-LABEL: udot4_acc16_vecMul:
1983 ; GFX10-DL: ; %bb.0: ; %entry
1984 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1985 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1986 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1987 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
1988 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1989 ; GFX10-DL-NEXT: s_clause 0x1
1990 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1991 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1992 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1993 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
1994 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1995 ; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1
1996 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1997 ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v2
1998 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v2
1999 ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v1
2000 ; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100
2001 ; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100
2002 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1
2003 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2
2004 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2005 ; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2006 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
2007 ; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100
2008 ; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
2009 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2011 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
2012 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2013 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
2014 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2015 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
2016 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
2017 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
2018 ; GFX10-DL-NEXT: s_endpgm
2019 ptr addrspace(1) %src2,
2020 ptr addrspace(1) nocapture %dst) {
; Per-work-item loads of the two packed byte vectors.
2022 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2023 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2024 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2025 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2026 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Widen and multiply as whole vectors; the lanes are only split afterwards.
2028 %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
2029 %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
2031 %mul = mul <4 x i16> %cvec1, %cvec2
2032 %mul0 = extractelement <4 x i16> %mul, i64 0
2033 %mul1 = extractelement <4 x i16> %mul, i64 1
2034 %mul2 = extractelement <4 x i16> %mul, i64 2
2035 %mul3 = extractelement <4 x i16> %mul, i64 3
; Accumulation order: acc + mul0, then + mul1, + mul2, + mul3.
2037 %acc = load i16, ptr addrspace(1) %dst, align 4
2038 %add1 = add i16 %mul0, %acc
2039 %add2 = add i16 %add1, %mul1
2040 %add3 = add i16 %add2, %mul2
2041 %add4 = add i16 %add3, %mul3
2043 store i16 %add4, ptr addrspace(1) %dst, align 4
2047 ; TODO: Support this pattern.
2048 define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
2049 ; GFX7-LABEL: udot4_acc8_vecMul:
2050 ; GFX7: ; %bb.0: ; %entry
2051 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2052 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2053 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2054 ; GFX7-NEXT: s_mov_b32 s10, 0
2055 ; GFX7-NEXT: s_mov_b32 s11, s3
2056 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2057 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2058 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2059 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2060 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2061 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2062 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2063 ; GFX7-NEXT: s_mov_b32 s2, -1
2064 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
2065 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2066 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
2067 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8
2068 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2069 ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
2070 ; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8
2071 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2072 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
2073 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
2074 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
2075 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
2076 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
2077 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
2078 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2079 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
2080 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2081 ; GFX7-NEXT: s_endpgm
2083 ; GFX8-LABEL: udot4_acc8_vecMul:
2084 ; GFX8: ; %bb.0: ; %entry
2085 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2086 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2087 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2088 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2089 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2090 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2091 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2092 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2093 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2094 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2095 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2096 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2097 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2098 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2099 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2100 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2101 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2102 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2103 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2104 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2105 ; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6
2106 ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2107 ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2108 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
2109 ; GFX8-NEXT: v_or_b32_e32 v8, v8, v9
2110 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8
2111 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2112 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2113 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
2114 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2115 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
2116 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v7
2117 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2118 ; GFX8-NEXT: s_endpgm
2120 ; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2121 ; GFX9-NODL: ; %bb.0: ; %entry
2122 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2123 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2124 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2125 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2126 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
2127 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
2128 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2129 ; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
2130 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
2131 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2132 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2133 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2134 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2135 ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
2136 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
2137 ; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2138 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
2139 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
2140 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2141 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
2142 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
2143 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
2144 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8
2145 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
2146 ; GFX9-NODL-NEXT: s_endpgm
2148 ; GFX9-DL-LABEL: udot4_acc8_vecMul:
2149 ; GFX9-DL: ; %bb.0: ; %entry
2150 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2151 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2152 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2153 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2154 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2155 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2156 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2157 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
2158 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2159 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2160 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2161 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2162 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2163 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
2164 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
2165 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2166 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
2167 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
2168 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2169 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
2170 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
2171 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
2172 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8
2173 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
2174 ; GFX9-DL-NEXT: s_endpgm
2176 ; GFX10-DL-LABEL: udot4_acc8_vecMul:
2177 ; GFX10-DL: ; %bb.0: ; %entry
2178 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2179 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2180 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2181 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX10-DL-NEXT: s_clause 0x1
2183 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2184 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2185 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2186 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
2187 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2188 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
2189 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2190 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2
2191 ; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1
2192 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
2193 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2194 ; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2
2195 ; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5
2196 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2197 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
2198 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8
2199 ; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9
2200 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
2201 ; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
2202 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2203 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
2204 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2205 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
2206 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
2207 ; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1
2208 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
2209 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
2210 ; GFX10-DL-NEXT: s_endpgm
2211 ptr addrspace(1) %src2,
2212 ptr addrspace(1) nocapture %dst) {
2214 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2215 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2216 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2217 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2218 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2220 %mul = mul <4 x i8> %vec1, %vec2
2221 %mul0 = extractelement <4 x i8> %mul, i64 0
2222 %mul1 = extractelement <4 x i8> %mul, i64 1
2223 %mul2 = extractelement <4 x i8> %mul, i64 2
2224 %mul3 = extractelement <4 x i8> %mul, i64 3
2226 %acc = load i8, ptr addrspace(1) %dst, align 4
2227 %add1 = add i8 %mul0, %acc
2228 %add2 = add i8 %add1, %mul1
2229 %add3 = add i8 %add2, %mul2
2230 %add4 = add i8 %add3, %mul3
2232 store i8 %add4, ptr addrspace(1) %dst, align 4
; Declaration of the intrinsic the test kernels call to obtain %idx, the i32
; value used as the getelementptr index into the <4 x i8> source arrays
; (see the body of udot4_acc8_vecMul). It takes no arguments and returns i32.
2236 declare i32 @llvm.amdgcn.workitem.id.x()