1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; udot4_acc32: per-work-item dot product of two <4 x i8> vectors with an i32
; accumulator. Each byte lane is zero-extended to i32 before the multiply, and
; the four products are added to the i32 value loaded from %dst.
; In the autogenerated checks below, gfx906 and gfx1011/gfx1012 fold the whole
; chain into a single v_dot4_u32_u8; gfx700 and gfx803 emit a v_mad_u32_u24
; chain, and gfx900 emits SDWA multiplies followed by v_add3_u32.
9 define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
10 ; GFX7-LABEL: udot4_acc32:
11 ; GFX7: ; %bb.0: ; %entry
12 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
13 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
14 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
15 ; GFX7-NEXT: s_mov_b32 s10, 0
16 ; GFX7-NEXT: s_mov_b32 s11, s3
17 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
19 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
20 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
21 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
23 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24 ; GFX7-NEXT: s_movk_i32 s4, 0xff
25 ; GFX7-NEXT: s_mov_b32 s2, -1
26 ; GFX7-NEXT: s_waitcnt vmcnt(1)
27 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
28 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
29 ; GFX7-NEXT: s_waitcnt vmcnt(0)
30 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
31 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
32 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
33 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
34 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
35 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
36 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4
38 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
39 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
40 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
41 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
42 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
45 ; GFX8-LABEL: udot4_acc32:
46 ; GFX8: ; %bb.0: ; %entry
47 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
48 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
49 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
50 ; GFX8-NEXT: s_movk_i32 s2, 0xff
51 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
53 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
54 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
55 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
56 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
57 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
58 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
59 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
60 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
61 ; GFX8-NEXT: s_waitcnt vmcnt(1)
62 ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3
63 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
64 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
65 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
66 ; GFX8-NEXT: s_waitcnt vmcnt(0)
67 ; GFX8-NEXT: v_and_b32_e32 v2, s2, v0
68 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
69 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s3
71 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
72 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
73 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
74 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
75 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
76 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
77 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
78 ; GFX8-NEXT: flat_store_dword v[0:1], v2
81 ; GFX9-NODL-LABEL: udot4_acc32:
82 ; GFX9-NODL: ; %bb.0: ; %entry
83 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
84 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
85 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
86 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
87 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
88 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
89 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
90 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
91 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
92 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
93 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
94 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
95 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
96 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
98 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
99 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
100 ; GFX9-NODL-NEXT: s_endpgm
102 ; GFX9-DL-LABEL: udot4_acc32:
103 ; GFX9-DL: ; %bb.0: ; %entry
104 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
105 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
106 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
107 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
108 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
110 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
111 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
112 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
113 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, s0
114 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
115 ; GFX9-DL-NEXT: s_endpgm
117 ; GFX10-DL-LABEL: udot4_acc32:
118 ; GFX10-DL: ; %bb.0: ; %entry
119 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
120 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
121 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
122 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX10-DL-NEXT: s_clause 0x1
124 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
125 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
126 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
127 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
128 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
129 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
130 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
131 ; GFX10-DL-NEXT: s_endpgm
132 <4 x i8> addrspace(1)* %src2,
133 i32 addrspace(1)* nocapture %dst) {
; Load this work-item's <4 x i8> element from each source array, indexed by
; the x work-item id.
135 %idx = call i32 @llvm.amdgcn.workitem.id.x()
136 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
137 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
138 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
139 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane 0: zero-extend both bytes to i32 and multiply.
141 %v1e0 = extractelement <4 x i8> %vec1, i64 0
142 %cv1e0 = zext i8 %v1e0 to i32
143 %v2e0 = extractelement <4 x i8> %vec2, i64 0
144 %cv2e0 = zext i8 %v2e0 to i32
145 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
; Lane 1.
147 %v1e1 = extractelement <4 x i8> %vec1, i64 1
148 %cv1e1 = zext i8 %v1e1 to i32
149 %v2e1 = extractelement <4 x i8> %vec2, i64 1
150 %cv2e1 = zext i8 %v2e1 to i32
151 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
; Lane 2.
153 %v1e2 = extractelement <4 x i8> %vec1, i64 2
154 %cv1e2 = zext i8 %v1e2 to i32
155 %v2e2 = extractelement <4 x i8> %vec2, i64 2
156 %cv2e2 = zext i8 %v2e2 to i32
157 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
; Lane 3.
159 %v1e3 = extractelement <4 x i8> %vec1, i64 3
160 %cv1e3 = zext i8 %v1e3 to i32
161 %v2e3 = extractelement <4 x i8> %vec2, i64 3
162 %cv2e3 = zext i8 %v2e3 to i32
163 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; Add the four products, lane 0 first, onto the i32 accumulator read from
; %dst, then write the sum back to %dst.
165 %acc = load i32, i32 addrspace(1)* %dst, align 4
166 %mad1 = add i32 %mul1, %acc
167 %mad2 = add i32 %mad1, %mul2
168 %mad3 = add i32 %mad2, %mul3
169 %mad4 = add i32 %mad3, %mul4
171 store i32 %mad4, i32 addrspace(1)* %dst, align 4
; udot4_acc16: same four-lane byte dot product as an i16 computation. Each
; byte lane is zero-extended to i16, multiplied as i16, and the products are
; added to the i16 value loaded from %dst.
; None of the autogenerated checks below contain a dot instruction: gfx700
; uses v_mad_u32_u24, gfx803 and gfx1011/gfx1012 use v_mad_u16, and
; gfx900/gfx906 use v_mad_legacy_u16.
175 define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
176 ; GFX7-LABEL: udot4_acc16:
177 ; GFX7: ; %bb.0: ; %entry
178 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
179 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
180 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
181 ; GFX7-NEXT: s_mov_b32 s10, 0
182 ; GFX7-NEXT: s_mov_b32 s11, s3
183 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
185 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
186 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
187 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
188 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
189 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
190 ; GFX7-NEXT: s_mov_b32 s2, -1
191 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0
192 ; GFX7-NEXT: s_movk_i32 s4, 0xff
193 ; GFX7-NEXT: s_waitcnt vmcnt(2)
194 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
195 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
196 ; GFX7-NEXT: s_waitcnt vmcnt(1)
197 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
198 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
199 ; GFX7-NEXT: s_waitcnt vmcnt(0)
200 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8
201 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
202 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
203 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
204 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
205 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
206 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
207 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
208 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
209 ; GFX7-NEXT: s_endpgm
211 ; GFX8-LABEL: udot4_acc16:
212 ; GFX8: ; %bb.0: ; %entry
213 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
214 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
215 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
216 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
218 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
219 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
220 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
221 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
222 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
223 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
224 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
225 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
226 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
227 ; GFX8-NEXT: flat_load_ushort v1, v[2:3]
228 ; GFX8-NEXT: s_movk_i32 s0, 0xff
229 ; GFX8-NEXT: v_mov_b32_e32 v5, s0
230 ; GFX8-NEXT: s_waitcnt vmcnt(2)
231 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4
232 ; GFX8-NEXT: v_and_b32_e32 v6, s0, v4
233 ; GFX8-NEXT: v_and_b32_e32 v8, s0, v8
234 ; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
235 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
236 ; GFX8-NEXT: s_waitcnt vmcnt(1)
237 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0
238 ; GFX8-NEXT: v_and_b32_e32 v7, s0, v0
239 ; GFX8-NEXT: v_and_b32_e32 v9, s0, v9
240 ; GFX8-NEXT: s_waitcnt vmcnt(0)
241 ; GFX8-NEXT: v_mad_u16 v1, v6, v7, v1
242 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
243 ; GFX8-NEXT: v_mad_u16 v1, v8, v9, v1
244 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
245 ; GFX8-NEXT: v_mad_u16 v1, v10, v5, v1
246 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1
247 ; GFX8-NEXT: flat_store_short v[2:3], v0
248 ; GFX8-NEXT: s_endpgm
250 ; GFX9-NODL-LABEL: udot4_acc16:
251 ; GFX9-NODL: ; %bb.0: ; %entry
252 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
253 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
254 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
255 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
256 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
257 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
259 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
260 ; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3]
261 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
262 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
263 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
264 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
265 ; GFX9-NODL-NEXT: v_and_b32_e32 v0, s0, v2
266 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v3
267 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6
268 ; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7
269 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
270 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
271 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
272 ; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
273 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v6, v7, v0
274 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
275 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
276 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0
277 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
278 ; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3]
279 ; GFX9-NODL-NEXT: s_endpgm
281 ; GFX9-DL-LABEL: udot4_acc16:
282 ; GFX9-DL: ; %bb.0: ; %entry
283 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
284 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
285 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
286 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
287 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
288 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
290 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
291 ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3]
292 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
293 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
294 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
295 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
296 ; GFX9-DL-NEXT: v_and_b32_e32 v0, s0, v2
297 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v3
298 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6
299 ; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7
300 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
301 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
302 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
303 ; GFX9-DL-NEXT: v_and_b32_sdwa v9, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v7, v0
305 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
306 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
307 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0
308 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
309 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3]
310 ; GFX9-DL-NEXT: s_endpgm
312 ; GFX10-DL-LABEL: udot4_acc16:
313 ; GFX10-DL: ; %bb.0: ; %entry
314 ; GFX10-DL-NEXT: s_clause 0x1
315 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
316 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
317 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
318 ; GFX10-DL-NEXT: s_movk_i32 s0, 0xff
319 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX10-DL-NEXT: s_clause 0x1
321 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
322 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
323 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
324 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
325 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
326 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
327 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
328 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
329 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1
330 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2
331 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
332 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6
333 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
334 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
335 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
336 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
337 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
338 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
339 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3
340 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
341 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
342 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
343 ; GFX10-DL-NEXT: s_endpgm
344 <4 x i8> addrspace(1)* %src2,
345 i16 addrspace(1)* nocapture %dst) {
; Load this work-item's <4 x i8> element from each source array, indexed by
; the x work-item id.
347 %idx = call i32 @llvm.amdgcn.workitem.id.x()
348 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
349 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
350 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
351 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane 0: zero-extend both bytes to i16 and multiply.
353 %v1e0 = extractelement <4 x i8> %vec1, i64 0
354 %cv1e0 = zext i8 %v1e0 to i16
355 %v2e0 = extractelement <4 x i8> %vec2, i64 0
356 %cv2e0 = zext i8 %v2e0 to i16
357 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
; Lane 1.
359 %v1e1 = extractelement <4 x i8> %vec1, i64 1
360 %cv1e1 = zext i8 %v1e1 to i16
361 %v2e1 = extractelement <4 x i8> %vec2, i64 1
362 %cv2e1 = zext i8 %v2e1 to i16
363 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
; Lane 2.
365 %v1e2 = extractelement <4 x i8> %vec1, i64 2
366 %cv1e2 = zext i8 %v1e2 to i16
367 %v2e2 = extractelement <4 x i8> %vec2, i64 2
368 %cv2e2 = zext i8 %v2e2 to i16
369 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
; Lane 3.
371 %v1e3 = extractelement <4 x i8> %vec1, i64 3
372 %cv1e3 = zext i8 %v1e3 to i16
373 %v2e3 = extractelement <4 x i8> %vec2, i64 3
374 %cv2e3 = zext i8 %v2e3 to i16
375 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
; Add the four products, lane 0 first, onto the i16 accumulator read from
; %dst, then write the sum back to %dst.
377 %acc = load i16, i16 addrspace(1)* %dst, align 2
378 %mad1 = add i16 %mul1, %acc
379 %mad2 = add i16 %mad1, %mul2
380 %mad3 = add i16 %mad2, %mul3
381 %mad4 = add i16 %mad3, %mul4
383 store i16 %mad4, i16 addrspace(1)* %dst, align 2
; udot4_acc8: four-lane byte dot product computed entirely in i8. The byte
; lanes are multiplied directly with no extension, and the products are added
; to the i8 value loaded from %dst.
; None of the autogenerated checks below contain a dot instruction: gfx700
; uses v_mad_u32_u24, gfx803 and gfx1011/gfx1012 use v_mad_u16, and
; gfx900/gfx906 use v_mad_legacy_u16. Every target finishes with a byte store.
387 define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
388 ; GFX7-LABEL: udot4_acc8:
389 ; GFX7: ; %bb.0: ; %entry
390 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
391 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
392 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
393 ; GFX7-NEXT: s_mov_b32 s10, 0
394 ; GFX7-NEXT: s_mov_b32 s11, s3
395 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
397 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
398 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
399 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
400 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
401 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
402 ; GFX7-NEXT: s_mov_b32 s2, -1
403 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
404 ; GFX7-NEXT: s_movk_i32 s4, 0xff
405 ; GFX7-NEXT: s_waitcnt vmcnt(2)
406 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
407 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
408 ; GFX7-NEXT: s_waitcnt vmcnt(1)
409 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
410 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
411 ; GFX7-NEXT: s_waitcnt vmcnt(0)
412 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8
413 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
414 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
415 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
416 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
417 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
418 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
419 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
420 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
421 ; GFX7-NEXT: s_endpgm
423 ; GFX8-LABEL: udot4_acc8:
424 ; GFX8: ; %bb.0: ; %entry
425 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
426 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
427 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
428 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
429 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
430 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
431 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
432 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
433 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
434 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
435 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
436 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
437 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
438 ; GFX8-NEXT: flat_load_ubyte v5, v[2:3]
439 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
440 ; GFX8-NEXT: s_waitcnt vmcnt(2)
441 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
442 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
443 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
444 ; GFX8-NEXT: s_waitcnt vmcnt(0)
445 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
446 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0
447 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0
448 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v5
449 ; GFX8-NEXT: v_mad_u16 v0, v7, v8, v0
450 ; GFX8-NEXT: v_mad_u16 v0, v1, v6, v0
451 ; GFX8-NEXT: v_mad_u16 v0, v9, v10, v0
452 ; GFX8-NEXT: flat_store_byte v[2:3], v0
453 ; GFX8-NEXT: s_endpgm
455 ; GFX9-NODL-LABEL: udot4_acc8:
456 ; GFX9-NODL: ; %bb.0: ; %entry
457 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
458 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
459 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
460 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
461 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
462 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
463 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
464 ; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3]
465 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
466 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
467 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
468 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
469 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
470 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
471 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
472 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
473 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
474 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2
475 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v3
476 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
477 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0
478 ; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3]
479 ; GFX9-NODL-NEXT: s_endpgm
481 ; GFX9-DL-LABEL: udot4_acc8:
482 ; GFX9-DL: ; %bb.0: ; %entry
483 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
484 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
485 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
486 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
487 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
489 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
490 ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
491 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
492 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
493 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
494 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
495 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
496 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
497 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
499 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
500 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v6, v7, v2
501 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v3
502 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
503 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v9, v0
504 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
505 ; GFX9-DL-NEXT: s_endpgm
507 ; GFX10-DL-LABEL: udot4_acc8:
508 ; GFX10-DL: ; %bb.0: ; %entry
509 ; GFX10-DL-NEXT: s_clause 0x1
510 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
511 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
512 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
513 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
514 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX10-DL-NEXT: s_clause 0x1
516 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
517 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
518 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
519 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
520 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
521 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
522 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
523 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
524 ; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4
525 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
526 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
527 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
528 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
529 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
530 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
531 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
532 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
533 ; GFX10-DL-NEXT: s_endpgm
534 <4 x i8> addrspace(1)* %src2,
535 i8 addrspace(1)* nocapture %dst) {
; Load this work-item's <4 x i8> element from each source array, indexed by
; the x work-item id.
537 %idx = call i32 @llvm.amdgcn.workitem.id.x()
538 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
539 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
540 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
541 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane 0: multiply the two bytes directly as i8 (no extension).
543 %v1e0 = extractelement <4 x i8> %vec1, i64 0
544 %v2e0 = extractelement <4 x i8> %vec2, i64 0
545 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
; Lane 1.
547 %v1e1 = extractelement <4 x i8> %vec1, i64 1
548 %v2e1 = extractelement <4 x i8> %vec2, i64 1
549 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
; Lane 2.
551 %v1e2 = extractelement <4 x i8> %vec1, i64 2
552 %v2e2 = extractelement <4 x i8> %vec2, i64 2
553 %mul3 = mul nuw nsw i8 %v1e2, %v2e2
; Lane 3.
555 %v1e3 = extractelement <4 x i8> %vec1, i64 3
556 %v2e3 = extractelement <4 x i8> %vec2, i64 3
557 %mul4 = mul nuw nsw i8 %v1e3, %v2e3
; Add the four products, lane 0 first, onto the i8 accumulator read from
; %dst, then write the sum back to %dst.
559 %acc = load i8, i8 addrspace(1)* %dst, align 2
560 %mad1 = add i8 %mul1, %acc
561 %mad2 = add i8 %mad1, %mul2
562 %mad3 = add i8 %mad2, %mul3
563 %mad4 = add i8 %mad3, %mul4
565 store i8 %mad4, i8 addrspace(1)* %dst, align 2
569 ; TODO: Generate udot4?
; udot2_8: two-lane variant of the byte dot product. Only lanes 0 and 1 of
; each <4 x i8> load are multiplied (as i8, no extension) and added to the i8
; value loaded from %dst.
; The autogenerated checks below currently show two chained MADs on every
; target (v_mad_u32_u24 on gfx700, v_mad_u16 on gfx803 and gfx1011/gfx1012,
; v_mad_legacy_u16 on gfx900/gfx906) and no dot instruction, which is what
; the TODO above refers to.
570 define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
571 ; GFX7-LABEL: udot2_8:
572 ; GFX7: ; %bb.0: ; %entry
573 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
574 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
575 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
576 ; GFX7-NEXT: s_mov_b32 s10, 0
577 ; GFX7-NEXT: s_mov_b32 s11, s3
578 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
580 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
581 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
582 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
583 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
584 ; GFX7-NEXT: s_mov_b32 s2, -1
585 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
586 ; GFX7-NEXT: buffer_load_ubyte v4, off, s[0:3], 0
587 ; GFX7-NEXT: s_movk_i32 s4, 0xff
588 ; GFX7-NEXT: s_waitcnt vmcnt(2)
589 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
590 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
591 ; GFX7-NEXT: s_waitcnt vmcnt(1)
592 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v0
593 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
594 ; GFX7-NEXT: s_waitcnt vmcnt(0)
595 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4
596 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
597 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
598 ; GFX7-NEXT: s_endpgm
600 ; GFX8-LABEL: udot2_8:
601 ; GFX8: ; %bb.0: ; %entry
602 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
603 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
604 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
605 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
606 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
607 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
608 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
609 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
610 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
611 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
612 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
613 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
614 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
615 ; GFX8-NEXT: flat_load_ubyte v5, v[2:3]
616 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
617 ; GFX8-NEXT: s_waitcnt vmcnt(2)
618 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v4
619 ; GFX8-NEXT: s_waitcnt vmcnt(0)
620 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0
621 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v5
622 ; GFX8-NEXT: v_mad_u16 v0, v1, v6, v0
623 ; GFX8-NEXT: flat_store_byte v[2:3], v0
624 ; GFX8-NEXT: s_endpgm
626 ; GFX9-NODL-LABEL: udot2_8:
627 ; GFX9-NODL: ; %bb.0: ; %entry
628 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
629 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
630 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
631 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
632 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
634 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
635 ; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3]
636 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
637 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
638 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
639 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
640 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
642 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
643 ; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3]
644 ; GFX9-NODL-NEXT: s_endpgm
646 ; GFX9-DL-LABEL: udot2_8:
647 ; GFX9-DL: ; %bb.0: ; %entry
648 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
649 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
650 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
651 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
652 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
654 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
655 ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
656 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
657 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
658 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
659 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
660 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
661 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4
662 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2
663 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
664 ; GFX9-DL-NEXT: s_endpgm
666 ; GFX10-DL-LABEL: udot2_8:
667 ; GFX10-DL: ; %bb.0: ; %entry
668 ; GFX10-DL-NEXT: s_clause 0x1
669 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
670 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
671 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
672 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
673 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
674 ; GFX10-DL-NEXT: s_clause 0x1
675 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
676 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
677 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
678 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
679 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
680 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
681 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
682 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
683 ; GFX10-DL-NEXT: v_mad_u16 v2, v2, v3, v4
684 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v2
685 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
686 ; GFX10-DL-NEXT: s_endpgm
687 <4 x i8> addrspace(1)* %src2,
688 i8 addrspace(1)* nocapture %dst) {
; Load this work-item's <4 x i8> element from each source array, indexed by
; the x work-item id.
690 %idx = call i32 @llvm.amdgcn.workitem.id.x()
691 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
692 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
693 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
694 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane 0: multiply the two bytes directly as i8 (no extension).
696 %v1e0 = extractelement <4 x i8> %vec1, i64 0
697 %v2e0 = extractelement <4 x i8> %vec2, i64 0
698 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
; Lane 1. Lanes 2 and 3 of both vectors are never read.
700 %v1e1 = extractelement <4 x i8> %vec1, i64 1
701 %v2e1 = extractelement <4 x i8> %vec2, i64 1
702 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
; Add both products onto the i8 accumulator read from %dst, then write the
; sum back to %dst.
704 %acc = load i8, i8 addrspace(1)* %dst, align 2
705 %mad1 = add i8 %mul1, %acc
706 %mad2 = add i8 %mad1, %mul2
707 store i8 %mad2, i8 addrspace(1)* %dst, align 2
; udot4_CommutationInsideMAD: multiplies the four i8 lanes of two <4 x i8>
; vectors and accumulates the products into an i8 value loaded from %dst.
; Compared with a straightforward dot-product pattern, each multiply takes the
; %vec2 element as its first operand and the adds place the accumulator /
; running sum in differing operand positions. The autogenerated assertions
; below expect a chain of four MAD instructions followed by a byte store on
; every checked target.
711 define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
712 ; GFX7-LABEL: udot4_CommutationInsideMAD:
713 ; GFX7: ; %bb.0: ; %entry
714 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
715 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
716 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
717 ; GFX7-NEXT: s_mov_b32 s10, 0
718 ; GFX7-NEXT: s_mov_b32 s11, s3
719 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
720 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
721 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
722 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
723 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
724 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
725 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
726 ; GFX7-NEXT: s_mov_b32 s2, -1
727 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
728 ; GFX7-NEXT: s_movk_i32 s4, 0xff
729 ; GFX7-NEXT: s_waitcnt vmcnt(2)
730 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
731 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
732 ; GFX7-NEXT: s_waitcnt vmcnt(1)
733 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
734 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
735 ; GFX7-NEXT: s_waitcnt vmcnt(0)
736 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v8
737 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
738 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
739 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
740 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
741 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
742 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
743 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
744 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
745 ; GFX7-NEXT: s_endpgm
747 ; GFX8-LABEL: udot4_CommutationInsideMAD:
748 ; GFX8: ; %bb.0: ; %entry
749 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
750 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
751 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
752 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
754 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
755 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
756 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
757 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
758 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
759 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
760 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
761 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
762 ; GFX8-NEXT: flat_load_ubyte v5, v[2:3]
763 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
764 ; GFX8-NEXT: s_waitcnt vmcnt(2)
765 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
766 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
767 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4
768 ; GFX8-NEXT: s_waitcnt vmcnt(0)
769 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
770 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0
771 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0
772 ; GFX8-NEXT: v_mad_u16 v0, v0, v4, v5
773 ; GFX8-NEXT: v_mad_u16 v0, v8, v7, v0
774 ; GFX8-NEXT: v_mad_u16 v0, v6, v1, v0
775 ; GFX8-NEXT: v_mad_u16 v0, v10, v9, v0
776 ; GFX8-NEXT: flat_store_byte v[2:3], v0
777 ; GFX8-NEXT: s_endpgm
779 ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
780 ; GFX9-NODL: ; %bb.0: ; %entry
781 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
782 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
783 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
784 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
785 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
787 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
788 ; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3]
789 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
790 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
791 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
792 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
793 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
794 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
795 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
796 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4
797 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
798 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v7, v6, v2
799 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v3
800 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v5, v0, v2
801 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v9, v8, v0
802 ; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3]
803 ; GFX9-NODL-NEXT: s_endpgm
805 ; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
806 ; GFX9-DL: ; %bb.0: ; %entry
807 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
808 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
809 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
810 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
811 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
812 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
813 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
814 ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
815 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
816 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
817 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
818 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
819 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
820 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
821 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
822 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4
823 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
824 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v7, v6, v2
825 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 24, v3
826 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v0, v2
827 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v8, v0
828 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
829 ; GFX9-DL-NEXT: s_endpgm
831 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
832 ; GFX10-DL: ; %bb.0: ; %entry
833 ; GFX10-DL-NEXT: s_clause 0x1
834 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
835 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
836 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
837 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
838 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
839 ; GFX10-DL-NEXT: s_clause 0x1
840 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
841 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
842 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
843 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
844 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
845 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
846 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
847 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
848 ; GFX10-DL-NEXT: v_mad_u16 v4, v3, v2, v4
849 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
850 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
851 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
852 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
853 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
854 ; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0
855 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
856 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
857 ; GFX10-DL-NEXT: s_endpgm
858 <4 x i8> addrspace(1)* %src2,
859 i8 addrspace(1)* nocapture %dst) {
; Each workitem loads one <4 x i8> from %src1 and %src2, indexed by its x id.
861 %idx = call i32 @llvm.amdgcn.workitem.id.x()
862 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
863 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
864 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
865 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane-wise i8 products; note the %vec2 element is the first mul operand.
867 %v1e0 = extractelement <4 x i8> %vec1, i64 0
868 %v2e0 = extractelement <4 x i8> %vec2, i64 0
869 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
871 %v1e1 = extractelement <4 x i8> %vec1, i64 1
872 %v2e1 = extractelement <4 x i8> %vec2, i64 1
873 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
875 %v1e2 = extractelement <4 x i8> %vec1, i64 2
876 %v2e2 = extractelement <4 x i8> %vec2, i64 2
877 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
879 %v1e3 = extractelement <4 x i8> %vec1, i64 3
880 %v2e3 = extractelement <4 x i8> %vec2, i64 3
881 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
; Accumulate in lane order. %mad1 has the accumulator as the first add operand,
; while %mad2..%mad4 have the product first and the running sum second.
883 %acc = load i8, i8 addrspace(1)* %dst, align 2
884 %mad1 = add i8 %acc, %mul1
885 %mad2 = add i8 %mul2, %mad1
886 %mad3 = add i8 %mul3, %mad2
887 %mad4 = add i8 %mul4, %mad3
; Write the i8 result back over the accumulator location.
889 store i8 %mad4, i8 addrspace(1)* %dst, align 2
893 ; TODO: Support commutation across the adds.
; udot4_CommutationAccrossMADs: same four i8 lane products as a 4-element dot
; product, accumulated into an i8 value loaded from %dst, but the add chain
; consumes the lane-1 product before the lane-0 product. The autogenerated
; assertions below expect a chain of four MAD instructions whose first MAD uses
; the operands shifted/extracted from bit 8, followed by a byte store, on every
; checked target.
894 define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
895 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
896 ; GFX7: ; %bb.0: ; %entry
897 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
898 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
899 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
900 ; GFX7-NEXT: s_mov_b32 s10, 0
901 ; GFX7-NEXT: s_mov_b32 s11, s3
902 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
903 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
904 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
905 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
906 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
907 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
908 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
909 ; GFX7-NEXT: s_mov_b32 s2, -1
910 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
911 ; GFX7-NEXT: s_movk_i32 s4, 0xff
912 ; GFX7-NEXT: s_waitcnt vmcnt(2)
913 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
914 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
915 ; GFX7-NEXT: s_waitcnt vmcnt(1)
916 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
917 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
918 ; GFX7-NEXT: s_waitcnt vmcnt(0)
919 ; GFX7-NEXT: v_mad_u32_u24 v3, v6, v3, v8
920 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
921 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
922 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v3
923 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
924 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
925 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
926 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
927 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
928 ; GFX7-NEXT: s_endpgm
930 ; GFX8-LABEL: udot4_CommutationAccrossMADs:
931 ; GFX8: ; %bb.0: ; %entry
932 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
933 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
934 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
935 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
936 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
937 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
938 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
939 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
940 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
941 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
942 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
943 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
944 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
945 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
946 ; GFX8-NEXT: flat_load_ubyte v10, v[2:3]
947 ; GFX8-NEXT: s_waitcnt vmcnt(2)
948 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4
949 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
950 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v4
951 ; GFX8-NEXT: s_waitcnt vmcnt(1)
952 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0
953 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
954 ; GFX8-NEXT: s_waitcnt vmcnt(0)
955 ; GFX8-NEXT: v_mad_u16 v6, v7, v6, v10
956 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0
957 ; GFX8-NEXT: v_mad_u16 v0, v0, v4, v6
958 ; GFX8-NEXT: v_mad_u16 v0, v5, v1, v0
959 ; GFX8-NEXT: v_mad_u16 v0, v9, v8, v0
960 ; GFX8-NEXT: flat_store_byte v[2:3], v0
961 ; GFX8-NEXT: s_endpgm
963 ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
964 ; GFX9-NODL: ; %bb.0: ; %entry
965 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
966 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
967 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
968 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
969 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
971 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
972 ; GFX9-NODL-NEXT: global_load_ubyte v9, v1, s[2:3]
973 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
974 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
975 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
976 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v3
977 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
978 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9
979 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
980 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2
981 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
982 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5
983 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v3
984 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2
985 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v8, v7, v0
986 ; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3]
987 ; GFX9-NODL-NEXT: s_endpgm
989 ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
990 ; GFX9-DL: ; %bb.0: ; %entry
991 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
992 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
993 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
994 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
995 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
996 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
997 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
998 ; GFX9-DL-NEXT: global_load_ubyte v9, v1, s[2:3]
999 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1000 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
1001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1002 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3
1003 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1004 ; GFX9-DL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9
1005 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1006 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2
1007 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
1008 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5
1009 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v3
1010 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2
1011 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v8, v7, v0
1012 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3]
1013 ; GFX9-DL-NEXT: s_endpgm
1015 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
1016 ; GFX10-DL: ; %bb.0: ; %entry
1017 ; GFX10-DL-NEXT: s_clause 0x1
1018 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1019 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1020 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1021 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
1022 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1023 ; GFX10-DL-NEXT: s_clause 0x1
1024 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
1025 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
1026 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
1027 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1028 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
1029 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1030 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
1031 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1032 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
1033 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1034 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
1035 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
1036 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1037 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1038 ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0
1039 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
1040 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
1041 ; GFX10-DL-NEXT: s_endpgm
1042 <4 x i8> addrspace(1)* %src2,
1043 i8 addrspace(1)* nocapture %dst) {
; Each workitem loads one <4 x i8> from %src1 and %src2, indexed by its x id.
1045 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1046 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1047 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1048 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1049 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane-wise i8 products, %vec2 element first in every mul.
1051 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1052 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1053 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
1055 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1056 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1057 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
1059 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1060 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1061 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
1063 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1064 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1065 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
; The point of this test: the accumulator is combined with %mul2 (lane 1)
; first and %mul1 (lane 0) second, then lanes 2 and 3 follow in order.
1067 %acc = load i8, i8 addrspace(1)* %dst, align 2
1068 %mad1 = add i8 %acc, %mul2
1069 %mad2 = add i8 %mad1, %mul1
1070 %mad3 = add i8 %mad2, %mul3
1071 %mad4 = add i8 %mad3, %mul4
; Write the i8 result back over the accumulator location.
1073 store i8 %mad4, i8 addrspace(1)* %dst, align 2
; udot4_multiuse_mul1: zero-extends the four i8 lanes of two <4 x i8> vectors
; to i32, multiplies them lane-wise and accumulates into an i32 loaded from
; %dst, with the lane-0 product %mul1 added into the sum twice. The
; autogenerated assertions below expect separate mul/mad/add instructions and a
; dword store on every checked target.
1077 define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
1078 ; GFX7-LABEL: udot4_multiuse_mul1:
1079 ; GFX7: ; %bb.0: ; %entry
1080 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1081 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1082 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1083 ; GFX7-NEXT: s_mov_b32 s10, 0
1084 ; GFX7-NEXT: s_mov_b32 s11, s3
1085 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1086 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1087 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1088 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1089 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1090 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1091 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1092 ; GFX7-NEXT: s_movk_i32 s4, 0xff
1093 ; GFX7-NEXT: s_mov_b32 s2, -1
1094 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1095 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
1096 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1097 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1098 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
1099 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1100 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1101 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1102 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1103 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1104 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1105 ; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4
1106 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8
1107 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
1108 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1109 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1110 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1111 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1112 ; GFX7-NEXT: s_endpgm
1114 ; GFX8-LABEL: udot4_multiuse_mul1:
1115 ; GFX8: ; %bb.0: ; %entry
1116 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1117 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1118 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1119 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1120 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1121 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1122 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1123 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1124 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1125 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1126 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1127 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1128 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1129 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
1130 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1131 ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3
1132 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
1133 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
1134 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1135 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX8-NEXT: v_and_b32_e32 v2, s2, v0
1137 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
1138 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1139 ; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s3
1140 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
1141 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
1142 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
1143 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1144 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
1145 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
1146 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1147 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1148 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1149 ; GFX8-NEXT: s_endpgm
1151 ; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1152 ; GFX9-NODL: ; %bb.0: ; %entry
1153 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1154 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1155 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1156 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
1157 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1158 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1159 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1160 ; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0
1161 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1162 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1163 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1
1164 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1165 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2
1166 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1167 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1168 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1169 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4
1170 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1171 ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s1
1172 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2
1173 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
1174 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1175 ; GFX9-NODL-NEXT: s_endpgm
1177 ; GFX9-DL-LABEL: udot4_multiuse_mul1:
1178 ; GFX9-DL: ; %bb.0: ; %entry
1179 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1180 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1181 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1182 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
1183 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1184 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1185 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1186 ; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0
1187 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1188 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1189 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1
1190 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1191 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2
1192 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1193 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1194 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1195 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4
1196 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1197 ; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s1
1198 ; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2
1199 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1
1200 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1201 ; GFX9-DL-NEXT: s_endpgm
1203 ; GFX10-DL-LABEL: udot4_multiuse_mul1:
1204 ; GFX10-DL: ; %bb.0: ; %entry
1205 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1206 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1207 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1208 ; GFX10-DL-NEXT: s_movk_i32 s3, 0xff
1209 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1210 ; GFX10-DL-NEXT: s_clause 0x1
1211 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1212 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1213 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1214 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1215 ; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1
1216 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1217 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2
1218 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1219 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3
1220 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1221 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
1222 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1223 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1224 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1225 ; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5
1226 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
1227 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
1228 ; GFX10-DL-NEXT: s_endpgm
1229 <4 x i8> addrspace(1)* %src2,
1230 i32 addrspace(1)* nocapture %dst) {
; Each workitem loads one <4 x i8> from %src1 and %src2, indexed by its x id.
1232 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1233 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1234 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1235 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1236 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Per lane: extract, zero-extend both i8 operands to i32, multiply.
1238 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1239 %cv1e0 = zext i8 %v1e0 to i32
1240 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1241 %cv2e0 = zext i8 %v2e0 to i32
1242 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1244 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1245 %cv1e1 = zext i8 %v1e1 to i32
1246 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1247 %cv2e1 = zext i8 %v2e1 to i32
1248 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1250 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1251 %cv1e2 = zext i8 %v1e2 to i32
1252 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1253 %cv2e2 = zext i8 %v2e2 to i32
1254 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1256 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1257 %cv1e3 = zext i8 %v1e3 to i32
1258 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1259 %cv2e3 = zext i8 %v2e3 to i32
1260 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; The point of this test: %mul1 has two uses, once in %add and again in %add2,
; so the sum is acc + mul1 + mul2 + mul1 + mul3 + mul4.
1262 %acc = load i32, i32 addrspace(1)* %dst, align 4
1263 %add = add i32 %mul1, %acc
1264 %add1 = add i32 %mul2, %add
1265 %add2 = add i32 %add1, %mul1
1266 %add3 = add i32 %add2, %mul3
1267 %add4 = add i32 %add3, %mul4
; Write the i32 result back over the accumulator location.
1269 store i32 %add4, i32 addrspace(1)* %dst, align 4
; udot4_multiuse_add1: zero-extends the four i8 lanes of two <4 x i8> vectors
; to i32, multiplies them lane-wise and accumulates into an i32 loaded from
; %dst, where the first partial sum %add1 feeds both the main add chain and a
; side value that is added back at the end. The autogenerated assertions below
; expect separate mul/mad/add instructions and a dword store on every checked
; target.
1273 define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
1274 ; GFX7-LABEL: udot4_multiuse_add1:
1275 ; GFX7: ; %bb.0: ; %entry
1276 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1277 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1278 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1279 ; GFX7-NEXT: s_mov_b32 s10, 0
1280 ; GFX7-NEXT: s_mov_b32 s11, s3
1281 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1282 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1283 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1284 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1285 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1286 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1287 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1288 ; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0
1289 ; GFX7-NEXT: s_movk_i32 s4, 0xff
1290 ; GFX7-NEXT: s_mov_b32 s2, -1
1291 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1292 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1293 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
1294 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1296 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1297 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s5
1298 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
1299 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1300 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1301 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
1302 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1303 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1304 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1305 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, s5, v3
1306 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1307 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6
1308 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1309 ; GFX7-NEXT: s_endpgm
1311 ; GFX8-LABEL: udot4_multiuse_add1:
1312 ; GFX8: ; %bb.0: ; %entry
1313 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1314 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1315 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1316 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1317 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1319 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1320 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1321 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1322 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1323 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1324 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1325 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1326 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
1327 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1328 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
1329 ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3
1330 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8
1331 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1332 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1333 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
1334 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1335 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s3
1336 ; GFX8-NEXT: v_and_b32_e32 v2, s2, v0
1337 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
1338 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
1339 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1340 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
1341 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s3, v4
1342 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
1343 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
1344 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1345 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1346 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1347 ; GFX8-NEXT: s_endpgm
1349 ; GFX9-NODL-LABEL: udot4_multiuse_add1:
1350 ; GFX9-NODL: ; %bb.0: ; %entry
1351 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1352 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1353 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1354 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1356 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1357 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
1358 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1359 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1360 ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
1361 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1362 ; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8
1363 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1364 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1365 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1366 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1367 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
1368 ; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2
1369 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6
1370 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4
1371 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1372 ; GFX9-NODL-NEXT: s_endpgm
1374 ; GFX9-DL-LABEL: udot4_multiuse_add1:
1375 ; GFX9-DL: ; %bb.0: ; %entry
1376 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1377 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1378 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1379 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1380 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1381 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1382 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1383 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1384 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1385 ; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 8, 8
1386 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1387 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 8
1388 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1389 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1390 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1391 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1392 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, s0
1393 ; GFX9-DL-NEXT: v_add_u32_e32 v4, s0, v2
1394 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v3, v6
1395 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v1, v4
1396 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1397 ; GFX9-DL-NEXT: s_endpgm
1399 ; GFX10-DL-LABEL: udot4_multiuse_add1:
1400 ; GFX10-DL: ; %bb.0: ; %entry
1401 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1402 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1403 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1404 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1405 ; GFX10-DL-NEXT: s_clause 0x1
1406 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1407 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1408 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1409 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1410 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 8
1411 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1412 ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8
1413 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1414 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1415 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
1416 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1417 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1418 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0
1419 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3
1420 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
1421 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
1422 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
1423 ; GFX10-DL-NEXT: s_endpgm
1424 <4 x i8> addrspace(1)* %src2,
1425 i32 addrspace(1)* nocapture %dst) {
; Each workitem loads one <4 x i8> from %src1 and %src2, indexed by its x id.
1427 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1428 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1429 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1430 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1431 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Per lane: extract, zero-extend both i8 operands to i32, multiply.
1433 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1434 %cv1e0 = zext i8 %v1e0 to i32
1435 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1436 %cv2e0 = zext i8 %v2e0 to i32
1437 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1439 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1440 %cv1e1 = zext i8 %v1e1 to i32
1441 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1442 %cv2e1 = zext i8 %v2e1 to i32
1443 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1445 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1446 %cv1e2 = zext i8 %v1e2 to i32
1447 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1448 %cv2e2 = zext i8 %v2e2 to i32
1449 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1451 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1452 %cv1e3 = zext i8 %v1e3 to i32
1453 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1454 %cv2e3 = zext i8 %v2e3 to i32
1455 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; The point of this test: %add1 (mul2 + acc) has two uses. It feeds the side
; value %add (add1 + acc) and the main chain %add2..%add4; %res then sums the
; end of the main chain with the side value.
1457 %acc = load i32, i32 addrspace(1)* %dst, align 4
1458 %add1 = add i32 %mul2, %acc
1459 %add = add i32 %add1, %acc
1460 %add2 = add i32 %add1, %mul1
1461 %add3 = add i32 %add2, %mul3
1462 %add4 = add i32 %add3, %mul4
1463 %res = add i32 %add4, %add
; Write the i32 result back over the accumulator location.
1464 store i32 %res, i32 addrspace(1)* %dst, align 4
; Negative test: a 4-element i8 multiply-accumulate into an i16 accumulator in
; which lane 0 is sign-extended while lanes 1-3 are zero-extended. For every
; RUN configuration, including the DL-capable ones, the expected output below is
; a chain of multiply-add instructions; no dot instruction appears.
1468 define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
1469 ; GFX7-LABEL: notdot4_mixedtypes:
1470 ; GFX7: ; %bb.0: ; %entry
1471 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1472 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1473 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1474 ; GFX7-NEXT: s_mov_b32 s10, 0
1475 ; GFX7-NEXT: s_mov_b32 s11, s3
1476 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1477 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1478 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1479 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1480 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1481 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1482 ; GFX7-NEXT: s_mov_b32 s2, -1
1483 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1484 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
1485 ; GFX7-NEXT: s_mov_b32 s4, 0xffff
1486 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1487 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
1488 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
1489 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1490 ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8
1491 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
1492 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
1493 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
1494 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1495 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1496 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
1497 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
1498 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
1499 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1500 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1501 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
1502 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1503 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1504 ; GFX7-NEXT: s_endpgm
1506 ; GFX8-LABEL: notdot4_mixedtypes:
1507 ; GFX8: ; %bb.0: ; %entry
1508 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1509 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1510 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1511 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1512 ; GFX8-NEXT: v_mov_b32_e32 v5, s2
1513 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1514 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1515 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1516 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1517 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
1518 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1519 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1520 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1521 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1522 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1523 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1524 ; GFX8-NEXT: flat_load_ushort v10, v[2:3]
1525 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1526 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
1527 ; GFX8-NEXT: v_and_b32_e32 v7, s2, v7
1528 ; GFX8-NEXT: v_bfe_i32 v1, v4, 0, 8
1529 ; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1530 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1531 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0
1532 ; GFX8-NEXT: v_and_b32_e32 v8, s2, v8
1533 ; GFX8-NEXT: v_bfe_i32 v6, v0, 0, 8
1534 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1535 ; GFX8-NEXT: v_mad_u16 v7, v7, v8, v10
1536 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1537 ; GFX8-NEXT: v_mad_u16 v1, v1, v6, v7
1538 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
1539 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1540 ; GFX8-NEXT: v_mad_u16 v1, v9, v5, v1
1541 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1
1542 ; GFX8-NEXT: flat_store_short v[2:3], v0
1543 ; GFX8-NEXT: s_endpgm
1545 ; GFX9-NODL-LABEL: notdot4_mixedtypes:
1546 ; GFX9-NODL: ; %bb.0: ; %entry
1547 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1548 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1549 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1550 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
1551 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
1552 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1553 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
1554 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
1555 ; GFX9-NODL-NEXT: global_load_ushort v9, v1, s[2:3]
1556 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
1557 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
1558 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1559 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v3
1560 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v5
1561 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6
1562 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v2, 0, 8
1563 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v3, 0, 8
1564 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1565 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v5, v5, v6, v9
1566 ; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1567 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1568 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5
1569 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1570 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1571 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v7, v8, v0
1572 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
1573 ; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3]
1574 ; GFX9-NODL-NEXT: s_endpgm
1576 ; GFX9-DL-LABEL: notdot4_mixedtypes:
1577 ; GFX9-DL: ; %bb.0: ; %entry
1578 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1579 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1580 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1581 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
1582 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
1583 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1584 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
1585 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
1586 ; GFX9-DL-NEXT: global_load_ushort v9, v1, s[2:3]
1587 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1588 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
1589 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1590 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3
1591 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
1592 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6
1593 ; GFX9-DL-NEXT: v_bfe_i32 v0, v2, 0, 8
1594 ; GFX9-DL-NEXT: v_bfe_i32 v4, v3, 0, 8
1595 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1596 ; GFX9-DL-NEXT: v_mad_legacy_u16 v5, v5, v6, v9
1597 ; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1598 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1599 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v4, v5
1600 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1601 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
1602 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v7, v8, v0
1603 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
1604 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3]
1605 ; GFX9-DL-NEXT: s_endpgm
1607 ; GFX10-DL-LABEL: notdot4_mixedtypes:
1608 ; GFX10-DL: ; %bb.0: ; %entry
1609 ; GFX10-DL-NEXT: s_clause 0x1
1610 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1611 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1612 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1613 ; GFX10-DL-NEXT: s_movk_i32 s0, 0xff
1614 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1615 ; GFX10-DL-NEXT: s_clause 0x1
1616 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1617 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1618 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1619 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
1620 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
1621 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
1622 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1623 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
1624 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
1625 ; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8
1626 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4
1627 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
1628 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1629 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
1630 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1631 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1632 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
1633 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1634 ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3
1635 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
1636 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
1637 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
1638 ; GFX10-DL-NEXT: s_endpgm
1639 <4 x i8> addrspace(1)* %src2,
1640 i16 addrspace(1)* nocapture %dst) {
; %idx comes from llvm.amdgcn.workitem.id.x and selects which <4 x i8> is
; loaded from each of %src1 and %src2.
1642 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1643 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1644 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1645 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1646 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Lane 0: both operands are SIGN-extended to i16 (the only signed lane).
1648 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1649 %cv1e0 = sext i8 %v1e0 to i16
1650 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1651 %cv2e0 = sext i8 %v2e0 to i16
1652 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
; Lanes 1-3: both operands are ZERO-extended to i16.
1654 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1655 %cv1e1 = zext i8 %v1e1 to i16
1656 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1657 %cv2e1 = zext i8 %v2e1 to i16
1658 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1660 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1661 %cv1e2 = zext i8 %v1e2 to i16
1662 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1663 %cv2e2 = zext i8 %v2e2 to i16
1664 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1666 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1667 %cv1e3 = zext i8 %v1e3 to i16
1668 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1669 %cv2e3 = zext i8 %v2e3 to i16
1670 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
; Accumulate onto the i16 value already stored at %dst. The lane-1 product is
; added first, then lanes 0, 2 and 3.
1672 %acc = load i16, i16 addrspace(1)* %dst, align 2
1673 %add1 = add i16 %mul2, %acc
1674 %add2 = add i16 %add1, %mul1
1675 %add3 = add i16 %add2, %mul3
1676 %add4 = add i16 %add3, %mul4
; Write the final sum back to %dst.
1678 store i16 %add4, i16 addrspace(1)* %dst, align 2
1682 ; TODO: cleanup s_lshr_b32 and support this pattern.
; Unsigned 4 x i8 dot product expressed with vector operations: both operands
; are zero-extended to <4 x i32>, multiplied as whole vectors, and the four
; products are then extracted and added one at a time onto the i32 accumulator
; loaded from %dst. The expected output for every RUN configuration is four
; 24-bit multiplies (or multiply-adds) plus adds; no dot instruction appears.
; NOTE(review): the TODO above mentions s_lshr_b32, but none of the expected
; outputs below contain that instruction - the remark may be stale; confirm.
1683 define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
1684 ; GFX7-LABEL: udot4_acc32_vecMul:
1685 ; GFX7: ; %bb.0: ; %entry
1686 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1687 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1688 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1689 ; GFX7-NEXT: s_mov_b32 s10, 0
1690 ; GFX7-NEXT: s_mov_b32 s11, s3
1691 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1692 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1693 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1694 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1695 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1696 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1697 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1698 ; GFX7-NEXT: s_movk_i32 s4, 0xff
1699 ; GFX7-NEXT: s_mov_b32 s2, -1
1700 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1701 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
1702 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
1703 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1704 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
1705 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
1707 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
1708 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1709 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
1710 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1711 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1712 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
1713 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
1714 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
1715 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
1716 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1717 ; GFX7-NEXT: s_endpgm
1719 ; GFX8-LABEL: udot4_acc32_vecMul:
1720 ; GFX8: ; %bb.0: ; %entry
1721 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1722 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1723 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1724 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1725 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1726 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1727 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1728 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1729 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1730 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1731 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1732 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1733 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1734 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
1735 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1736 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
1737 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
1738 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3
1739 ; GFX8-NEXT: v_and_b32_e32 v3, s2, v3
1740 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1741 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
1742 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
1743 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
1744 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
1745 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s3
1747 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
1748 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
1749 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
1750 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1751 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1752 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1753 ; GFX8-NEXT: s_endpgm
1755 ; GFX9-NODL-LABEL: udot4_acc32_vecMul:
1756 ; GFX9-NODL: ; %bb.0: ; %entry
1757 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1758 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1759 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1760 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1761 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1762 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1763 ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
1764 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1765 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1766 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1767 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1768 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1769 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1770 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1771 ; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
1772 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
1773 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
1774 ; GFX9-NODL-NEXT: s_endpgm
1776 ; GFX9-DL-LABEL: udot4_acc32_vecMul:
1777 ; GFX9-DL: ; %bb.0: ; %entry
1778 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1779 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1780 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1781 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1782 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1783 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1784 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1785 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1786 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1787 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1788 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1789 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1790 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1791 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1792 ; GFX9-DL-NEXT: v_add3_u32 v2, v3, s0, v4
1793 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v5, v1
1794 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1795 ; GFX9-DL-NEXT: s_endpgm
1797 ; GFX10-DL-LABEL: udot4_acc32_vecMul:
1798 ; GFX10-DL: ; %bb.0: ; %entry
1799 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1800 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1801 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1802 ; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff
1803 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1804 ; GFX10-DL-NEXT: s_clause 0x1
1805 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
1806 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
1807 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1808 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1809 ; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1810 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1811 ; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1812 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1813 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3
1814 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1815 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1816 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1817 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1818 ; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0
1819 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
1820 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
1821 ; GFX10-DL-NEXT: s_endpgm
1822 <4 x i8> addrspace(1)* %src2,
1823 i32 addrspace(1)* nocapture %dst) {
; %idx comes from llvm.amdgcn.workitem.id.x and selects which <4 x i8> is
; loaded from each of %src1 and %src2.
1825 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1826 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1827 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1828 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1829 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Widen both operands as whole vectors (unsigned) before multiplying.
1831 %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
1832 %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
; One vector multiply, then split the result into its four scalar products.
1834 %mul = mul <4 x i32> %cvec1, %cvec2
1835 %mul0 = extractelement <4 x i32> %mul, i64 0
1836 %mul1 = extractelement <4 x i32> %mul, i64 1
1837 %mul2 = extractelement <4 x i32> %mul, i64 2
1838 %mul3 = extractelement <4 x i32> %mul, i64 3
; Accumulate the products in lane order onto the i32 value already at %dst.
1840 %acc = load i32, i32 addrspace(1)* %dst, align 4
1841 %add1 = add i32 %mul0, %acc
1842 %add2 = add i32 %add1, %mul1
1843 %add3 = add i32 %add2, %mul2
1844 %add4 = add i32 %add3, %mul3
; Write the final sum back to %dst.
1846 store i32 %add4, i32 addrspace(1)* %dst, align 4
1850 ; TODO: This pattern should be recognized.
; Same vector-multiply shape as the i32 variant, but with 16-bit arithmetic:
; both <4 x i8> operands are zero-extended to <4 x i16>, multiplied as whole
; vectors, and the four products are added one at a time onto the i16
; accumulator loaded from %dst. No dot instruction appears in any expected
; output below; the gfx9 and gfx10 expectations use two packed 16-bit
; multiplies followed by 16-bit adds.
1851 define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
1852 ; GFX7-LABEL: udot4_acc16_vecMul:
1853 ; GFX7: ; %bb.0: ; %entry
1854 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1855 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1856 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1857 ; GFX7-NEXT: s_mov_b32 s10, 0
1858 ; GFX7-NEXT: s_mov_b32 s11, s3
1859 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1861 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1862 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1863 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1864 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1865 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1866 ; GFX7-NEXT: s_mov_b32 s2, -1
1867 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0
1868 ; GFX7-NEXT: s_mov_b32 s4, 0xff00
1869 ; GFX7-NEXT: s_movk_i32 s5, 0xff
1870 ; GFX7-NEXT: s_waitcnt vmcnt(2)
1871 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
1872 ; GFX7-NEXT: v_and_b32_e32 v3, s5, v2
1873 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1874 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1875 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0
1876 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
1877 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5
1878 ; GFX7-NEXT: v_and_b32_e32 v6, s5, v0
1879 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3
1880 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
1881 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1882 ; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
1883 ; GFX7-NEXT: v_and_b32_e32 v3, s5, v3
1884 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1885 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8
1886 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
1887 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
1888 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v6, v1
1889 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1890 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1891 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
1892 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
1893 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1894 ; GFX7-NEXT: s_endpgm
1896 ; GFX8-LABEL: udot4_acc16_vecMul:
1897 ; GFX8: ; %bb.0: ; %entry
1898 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1899 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1900 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1901 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1902 ; GFX8-NEXT: v_mov_b32_e32 v5, s2
1903 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1904 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1905 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1906 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1907 ; GFX8-NEXT: flat_load_dword v4, v[0:1]
1908 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1909 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1910 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1911 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1912 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1913 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1914 ; GFX8-NEXT: flat_load_ushort v10, v[2:3]
1915 ; GFX8-NEXT: s_waitcnt vmcnt(2)
1916 ; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1917 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v4
1918 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v4
1919 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
1920 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1921 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0
1922 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 8, v0
1923 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1924 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
1925 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1926 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v10
1927 ; GFX8-NEXT: v_mad_u16 v0, v6, v8, v0
1928 ; GFX8-NEXT: v_mad_u16 v0, v9, v5, v0
1929 ; GFX8-NEXT: v_mad_u16 v0, v1, v7, v0
1930 ; GFX8-NEXT: flat_store_short v[2:3], v0
1931 ; GFX8-NEXT: s_endpgm
1933 ; GFX9-NODL-LABEL: udot4_acc16_vecMul:
1934 ; GFX9-NODL: ; %bb.0: ; %entry
1935 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1936 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1937 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1938 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
1939 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff
1940 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1941 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
1942 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
1943 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1944 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3]
1945 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
1946 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v5, 8, v1
1947 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
1948 ; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1949 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 24, v1
1950 ; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1951 ; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1952 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v7, 8, v2
1953 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
1954 ; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1955 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
1956 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
1957 ; GFX9-NODL-NEXT: v_and_b32_e32 v10, v4, v10
1958 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, v4, v9
1959 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1960 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1961 ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3
1962 ; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v8, 16, v10
1963 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1964 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5
1965 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1966 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2
1967 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1968 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3]
1969 ; GFX9-NODL-NEXT: s_endpgm
1971 ; GFX9-DL-LABEL: udot4_acc16_vecMul:
1972 ; GFX9-DL: ; %bb.0: ; %entry
1973 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1974 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1975 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1976 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
1977 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
1978 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1979 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1980 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1981 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1982 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
1983 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1984 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v5, 8, v1
1985 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1986 ; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1987 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1
1988 ; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1989 ; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1990 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v7, 8, v2
1991 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2
1992 ; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1993 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
1994 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
1995 ; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10
1996 ; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v9
1997 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1998 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1999 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3
2000 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v8, 16, v10
2001 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4
2002 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5
2003 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2004 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2
2005 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2006 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
2007 ; GFX9-DL-NEXT: s_endpgm
2009 ; GFX10-DL-LABEL: udot4_acc16_vecMul:
2010 ; GFX10-DL: ; %bb.0: ; %entry
2011 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2012 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2013 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2014 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
2015 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
2016 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2017 ; GFX10-DL-NEXT: s_clause 0x1
2018 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2019 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2020 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2021 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
2022 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2023 ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
2024 ; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2025 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2026 ; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2
2027 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2028 ; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2029 ; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2030 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8
2031 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
2032 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2033 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
2034 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10
2035 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9
2036 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
2037 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
2038 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
2039 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5
2040 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2041 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
2042 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2043 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
2044 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2045 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
2046 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
2047 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
2048 ; GFX10-DL-NEXT: s_endpgm
2049 <4 x i8> addrspace(1)* %src2,
2050 i16 addrspace(1)* nocapture %dst) {
; %idx comes from llvm.amdgcn.workitem.id.x and selects which <4 x i8> is
; loaded from each of %src1 and %src2.
2052 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2053 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2054 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2055 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2056 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
; Widen both operands as whole vectors (unsigned) to 16-bit lanes.
2058 %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
2059 %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
; One vector multiply, then split the result into its four scalar products.
2061 %mul = mul <4 x i16> %cvec1, %cvec2
2062 %mul0 = extractelement <4 x i16> %mul, i64 0
2063 %mul1 = extractelement <4 x i16> %mul, i64 1
2064 %mul2 = extractelement <4 x i16> %mul, i64 2
2065 %mul3 = extractelement <4 x i16> %mul, i64 3
; Accumulate the products in lane order onto the i16 value already at %dst.
; NOTE(review): this i16 load and the store below declare align 4, whereas
; notdot4_mixedtypes uses align 2 for its i16 accumulator - confirm intended.
2067 %acc = load i16, i16 addrspace(1)* %dst, align 4
2068 %add1 = add i16 %mul0, %acc
2069 %add2 = add i16 %add1, %mul1
2070 %add3 = add i16 %add2, %mul2
2071 %add4 = add i16 %add3, %mul3
; Write the final sum back to %dst.
2073 store i16 %add4, i16 addrspace(1)* %dst, align 4
2077 ; TODO: Support this pattern.
2078 define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
2079 ; GFX7-LABEL: udot4_acc8_vecMul:
2080 ; GFX7: ; %bb.0: ; %entry
2081 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2082 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2083 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2084 ; GFX7-NEXT: s_mov_b32 s10, 0
2085 ; GFX7-NEXT: s_mov_b32 s11, s3
2086 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2087 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2088 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2089 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2090 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2091 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2092 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2093 ; GFX7-NEXT: s_mov_b32 s2, -1
2094 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
2095 ; GFX7-NEXT: s_movk_i32 s4, 0xff
2096 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2097 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v2
2098 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
2099 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2100 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v0
2101 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
2102 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8
2104 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
2105 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
2106 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
2107 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
2108 ; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3
2109 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
2110 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
2111 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2112 ; GFX7-NEXT: s_endpgm
2114 ; GFX8-LABEL: udot4_acc8_vecMul:
2115 ; GFX8: ; %bb.0: ; %entry
2116 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2117 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2118 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2119 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2120 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2121 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2122 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2123 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2124 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2125 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2126 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2127 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2128 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2129 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2130 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2131 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2132 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2133 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2134 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2135 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2136 ; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6
2137 ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2138 ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2139 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
2140 ; GFX8-NEXT: v_or_b32_e32 v8, v8, v9
2141 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8
2142 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2143 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2144 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
2145 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2146 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2
2147 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v7
2148 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2149 ; GFX8-NEXT: s_endpgm
2151 ; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2152 ; GFX9-NODL: ; %bb.0: ; %entry
2153 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2154 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2155 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2156 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2157 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
2158 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
2159 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2160 ; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3]
2161 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
2162 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2163 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2164 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2165 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2166 ; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
2167 ; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2168 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2169 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
2170 ; GFX9-NODL-NEXT: v_or_b32_e32 v7, v7, v8
2171 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2172 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2173 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
2174 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v7
2175 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
2176 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
2177 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
2178 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
2179 ; GFX9-NODL-NEXT: s_endpgm
2181 ; GFX9-DL-LABEL: udot4_acc8_vecMul:
2182 ; GFX9-DL: ; %bb.0: ; %entry
2183 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2184 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2185 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2186 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2187 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2188 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2189 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2190 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
2191 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2192 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2193 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2194 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2195 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2196 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
2197 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2198 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2199 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
2200 ; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8
2201 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
2202 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2203 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
2204 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
2205 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
2206 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
2207 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
2208 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
2209 ; GFX9-DL-NEXT: s_endpgm
2211 ; GFX10-DL-LABEL: udot4_acc8_vecMul:
2212 ; GFX10-DL: ; %bb.0: ; %entry
2213 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2214 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2215 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2216 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2217 ; GFX10-DL-NEXT: s_clause 0x1
2218 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
2219 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
2220 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2221 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
2222 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2223 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
2224 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2225 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2
2226 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
2227 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
2228 ; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2
2229 ; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5
2230 ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
2231 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7
2232 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2233 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
2234 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
2235 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8
2236 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2237 ; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5
2238 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4
2239 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
2240 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2241 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
2242 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
2243 ; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1
2244 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
2245 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
2246 ; GFX10-DL-NEXT: s_endpgm
2247 <4 x i8> addrspace(1)* %src2,
2248 i8 addrspace(1)* nocapture %dst) {
2250 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2251 %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2252 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2253 %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2254 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
2256 %mul = mul <4 x i8> %vec1, %vec2
2257 %mul0 = extractelement <4 x i8> %mul, i64 0
2258 %mul1 = extractelement <4 x i8> %mul, i64 1
2259 %mul2 = extractelement <4 x i8> %mul, i64 2
2260 %mul3 = extractelement <4 x i8> %mul, i64 3
2262 %acc = load i8, i8 addrspace(1)* %dst, align 4
2263 %add1 = add i8 %mul0, %acc
2264 %add2 = add i8 %add1, %mul1
2265 %add3 = add i8 %add2, %mul2
2266 %add4 = add i8 %add3, %mul3
2268 store i8 %add4, i8 addrspace(1)* %dst, align 4
; Declaration of the workitem-id intrinsic used by the kernel above: it is
; called with no arguments and its i32 result (%idx) is the element index
; applied to both %src1 and %src2 before the <4 x i8> loads.
; (Per its name this is the x-dimension workitem id; see the LLVM AMDGPU
; usage guide for the exact semantics.)
2272 declare i32 @llvm.amdgcn.workitem.id.x()