1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; idot8_acc32: signed 8-lane i4 dot product accumulated in i32.
; The kernel loads one <8 x i4> vector from each of %src1 and %src2,
; sign-extends every lane to i32, multiplies matching lanes, adds the eight
; products to the i32 value loaded from %dst, and stores the sum back to %dst.
; Per the autogenerated checks below, GFX7/GFX8/GFX9 expect a chain of
; v_mad_i32_i24 instructions, while GFX9-DL expects a single v_dot8_i32_i4.
7 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
8 ; GFX7-LABEL: idot8_acc32:
9 ; GFX7: ; %bb.0: ; %entry
10 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
11 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
12 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX7-NEXT: s_mov_b32 s6, -1
14 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
16 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
17 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
18 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000
20 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000
21 ; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004
22 ; GFX7-NEXT: v_mov_b32_e32 v0, s9
23 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
24 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
25 ; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004
26 ; GFX7-NEXT: v_mov_b32_e32 v1, s11
27 ; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008
28 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
29 ; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008
30 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
31 ; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c
32 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
33 ; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c
34 ; GFX7-NEXT: v_mov_b32_e32 v1, s15
35 ; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010
36 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0
37 ; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010
38 ; GFX7-NEXT: v_mov_b32_e32 v1, s17
39 ; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014
40 ; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018
41 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0
42 ; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014
43 ; GFX7-NEXT: v_mov_b32_e32 v1, s19
44 ; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018
45 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
46 ; GFX7-NEXT: v_mov_b32_e32 v1, s21
47 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
48 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0
49 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28
50 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
51 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0
52 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GFX8-LABEL: idot8_acc32:
56 ; GFX8: ; %bb.0: ; %entry
57 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
58 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
59 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
61 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
62 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
63 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
64 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
65 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
67 ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
68 ; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004
69 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
70 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
71 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
72 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
73 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
74 ; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008
75 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2
76 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
77 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
78 ; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c
79 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2
80 ; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
81 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
82 ; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010
83 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2
84 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
85 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
86 ; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014
87 ; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018
88 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2
89 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
90 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
91 ; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
92 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
93 ; GFX8-NEXT: v_mov_b32_e32 v3, s17
94 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28
95 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2
96 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28
97 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
98 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
99 ; GFX8-NEXT: flat_store_dword v[0:1], v2
100 ; GFX8-NEXT: s_endpgm
102 ; GFX9-LABEL: idot8_acc32:
103 ; GFX9: ; %bb.0: ; %entry
104 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
105 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
106 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
108 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
109 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
110 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
111 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
112 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
114 ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
115 ; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004
116 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
117 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
118 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3
119 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
120 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
121 ; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008
122 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2
123 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
124 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
125 ; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c
126 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2
127 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
128 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
129 ; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010
130 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2
131 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
132 ; GFX9-NEXT: v_mov_b32_e32 v3, s13
133 ; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014
134 ; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018
135 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2
136 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
137 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
138 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
139 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2
140 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
141 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28
142 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2
143 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28
144 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
145 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
146 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
147 ; GFX9-NEXT: s_endpgm
149 ; GFX9-DL-LABEL: idot8_acc32:
150 ; GFX9-DL: ; %bb.0: ; %entry
151 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
152 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
153 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
155 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
156 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
157 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
158 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
159 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
160 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
161 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
162 ; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3
163 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
164 ; GFX9-DL-NEXT: s_endpgm
165 <8 x i4> addrspace(1)* %src2,
166 i32 addrspace(1)* nocapture %dst) {
; Load both packed <8 x i4> operand vectors.
168 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
169 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Lanes 0-7: extract the i4 element from each vector, sign-extend both to
; i32, and multiply them.
; NOTE(review): `nuw` on a product of sign-extended operands looks
; questionable for negative lane values (an unsigned wrap would make the
; result poison) -- confirm the flag combination is intended.
171 %v1e0 = extractelement <8 x i4> %vec1, i64 0
172 %cv1e0 = sext i4 %v1e0 to i32
173 %v2e0 = extractelement <8 x i4> %vec2, i64 0
174 %cv2e0 = sext i4 %v2e0 to i32
175 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
177 %v1e1 = extractelement <8 x i4> %vec1, i64 1
178 %cv1e1 = sext i4 %v1e1 to i32
179 %v2e1 = extractelement <8 x i4> %vec2, i64 1
180 %cv2e1 = sext i4 %v2e1 to i32
181 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
183 %v1e2 = extractelement <8 x i4> %vec1, i64 2
184 %cv1e2 = sext i4 %v1e2 to i32
185 %v2e2 = extractelement <8 x i4> %vec2, i64 2
186 %cv2e2 = sext i4 %v2e2 to i32
187 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
189 %v1e3 = extractelement <8 x i4> %vec1, i64 3
190 %cv1e3 = sext i4 %v1e3 to i32
191 %v2e3 = extractelement <8 x i4> %vec2, i64 3
192 %cv2e3 = sext i4 %v2e3 to i32
193 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
195 %v1e4 = extractelement <8 x i4> %vec1, i64 4
196 %cv1e4 = sext i4 %v1e4 to i32
197 %v2e4 = extractelement <8 x i4> %vec2, i64 4
198 %cv2e4 = sext i4 %v2e4 to i32
199 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
201 %v1e5 = extractelement <8 x i4> %vec1, i64 5
202 %cv1e5 = sext i4 %v1e5 to i32
203 %v2e5 = extractelement <8 x i4> %vec2, i64 5
204 %cv2e5 = sext i4 %v2e5 to i32
205 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
207 %v1e6 = extractelement <8 x i4> %vec1, i64 6
208 %cv1e6 = sext i4 %v1e6 to i32
209 %v2e6 = extractelement <8 x i4> %vec2, i64 6
210 %cv2e6 = sext i4 %v2e6 to i32
211 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
213 %v1e7 = extractelement <8 x i4> %vec1, i64 7
214 %cv1e7 = sext i4 %v1e7 to i32
215 %v2e7 = extractelement <8 x i4> %vec2, i64 7
216 %cv2e7 = sext i4 %v2e7 to i32
217 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Add the eight products, in lane order, to the i32 accumulator loaded from
; %dst, then store the total back to %dst.
219 %acc = load i32, i32 addrspace(1)* %dst, align 4
220 %add1 = add i32 %mul0, %acc
221 %add2 = add i32 %add1, %mul1
222 %add3 = add i32 %add2, %mul2
223 %add4 = add i32 %add3, %mul3
224 %add5 = add i32 %add4, %mul4
225 %add6 = add i32 %add5, %mul5
226 %add7 = add i32 %add6, %mul6
227 %add8 = add i32 %add7, %mul7
229 store i32 %add8, i32 addrspace(1)* %dst, align 4
233 ; TODO: Once the unnecessary zero extensions of the elements are removed,
234 ; the pattern recognizer will kick in.
; idot8_acc16: signed 8-lane i4 dot product accumulated in i16.
; The kernel loads one <8 x i4> vector from each of %src1 and %src2,
; sign-extends every lane to i16, multiplies matching lanes, adds the eight
; products to the i16 value loaded from %dst, and stores the sum back to %dst.
; Per the autogenerated checks below, none of the four prefixes (GFX9-DL
; included) expect a dot instruction here; they expect mad/mul/add sequences.
235 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
236 ; GFX7-LABEL: idot8_acc16:
237 ; GFX7: ; %bb.0: ; %entry
238 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
239 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
240 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
241 ; GFX7-NEXT: s_mov_b32 s6, -1
242 ; GFX7-NEXT: s_mov_b32 s0, 0xffff
243 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
245 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
246 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0
247 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000
249 ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004
250 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
251 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004
252 ; GFX7-NEXT: s_and_b32 s9, s9, s0
253 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008
254 ; GFX7-NEXT: s_and_b32 s11, s11, s0
255 ; GFX7-NEXT: s_and_b32 s8, s8, s0
256 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
257 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008
258 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c
259 ; GFX7-NEXT: s_and_b32 s13, s13, s0
260 ; GFX7-NEXT: s_and_b32 s10, s10, s0
261 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
262 ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c
263 ; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010
264 ; GFX7-NEXT: s_and_b32 s15, s15, s0
265 ; GFX7-NEXT: s_and_b32 s12, s12, s0
266 ; GFX7-NEXT: v_mov_b32_e32 v3, s13
267 ; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010
268 ; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014
269 ; GFX7-NEXT: s_and_b32 s17, s17, s0
270 ; GFX7-NEXT: s_and_b32 s14, s14, s0
271 ; GFX7-NEXT: v_mov_b32_e32 v4, s15
272 ; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018
273 ; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014
274 ; GFX7-NEXT: s_and_b32 s19, s19, s0
275 ; GFX7-NEXT: s_and_b32 s16, s16, s0
276 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
277 ; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018
278 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
279 ; GFX7-NEXT: s_and_b32 s21, s21, s0
280 ; GFX7-NEXT: s_and_b32 s18, s18, s0
281 ; GFX7-NEXT: v_mov_b32_e32 v6, s19
282 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
283 ; GFX7-NEXT: s_and_b32 s20, s20, s0
284 ; GFX7-NEXT: s_and_b32 s2, s2, s0
285 ; GFX7-NEXT: v_mov_b32_e32 v7, s21
286 ; GFX7-NEXT: s_and_b32 s0, s1, s0
287 ; GFX7-NEXT: s_waitcnt vmcnt(0)
288 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
289 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0
290 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0
291 ; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0
292 ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0
293 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0
294 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0
295 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
296 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0
297 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
298 ; GFX7-NEXT: s_endpgm
300 ; GFX8-LABEL: idot8_acc16:
301 ; GFX8: ; %bb.0: ; %entry
302 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
303 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
304 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
306 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
307 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
308 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
309 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
310 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX8-NEXT: s_lshr_b32 s0, s2, 4
312 ; GFX8-NEXT: s_lshr_b32 s1, s4, 4
313 ; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40000
314 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0
315 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
316 ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x40008
317 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
318 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000
319 ; GFX8-NEXT: s_lshr_b32 s1, s2, 12
320 ; GFX8-NEXT: s_lshr_b32 s5, s4, 12
321 ; GFX8-NEXT: v_mov_b32_e32 v6, s0
322 ; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40008
323 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
324 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
325 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s1
326 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
327 ; GFX8-NEXT: v_mul_i32_i24_e32 v6, s7, v6
328 ; GFX8-NEXT: s_lshr_b32 s0, s2, 20
329 ; GFX8-NEXT: s_lshr_b32 s1, s4, 20
330 ; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40010
331 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
332 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
333 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0
334 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1
335 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40010
336 ; GFX8-NEXT: v_mov_b32_e32 v13, s5
337 ; GFX8-NEXT: s_lshr_b32 s0, s2, 28
338 ; GFX8-NEXT: s_lshr_b32 s9, s4, 28
339 ; GFX8-NEXT: s_bfe_i32 s4, s4, 0x40018
340 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
341 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
342 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s0
343 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s9
344 ; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40018
345 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
346 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
347 ; GFX8-NEXT: s_waitcnt vmcnt(0)
348 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
349 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
350 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
351 ; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
352 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v13, v2
353 ; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2
354 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
355 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
356 ; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2
357 ; GFX8-NEXT: flat_store_short v[0:1], v2
358 ; GFX8-NEXT: s_endpgm
360 ; GFX9-LABEL: idot8_acc16:
361 ; GFX9: ; %bb.0: ; %entry
362 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
363 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
364 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
366 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
367 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
368 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
369 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
370 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX9-NEXT: s_lshr_b32 s0, s2, 4
372 ; GFX9-NEXT: s_lshr_b32 s1, s4, 4
373 ; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40000
374 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
375 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
376 ; GFX9-NEXT: s_bfe_i32 s0, s4, 0x40008
377 ; GFX9-NEXT: v_mov_b32_e32 v5, s5
378 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
379 ; GFX9-NEXT: s_lshr_b32 s1, s2, 12
380 ; GFX9-NEXT: s_lshr_b32 s5, s4, 12
381 ; GFX9-NEXT: v_mov_b32_e32 v6, s0
382 ; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40008
383 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
384 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
385 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s1
386 ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
387 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, s7, v6
388 ; GFX9-NEXT: s_lshr_b32 s0, s2, 20
389 ; GFX9-NEXT: s_lshr_b32 s1, s4, 20
390 ; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40010
391 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
392 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
393 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s0
394 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s1
395 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40010
396 ; GFX9-NEXT: v_mov_b32_e32 v13, s5
397 ; GFX9-NEXT: s_lshr_b32 s0, s2, 28
398 ; GFX9-NEXT: s_lshr_b32 s9, s4, 28
399 ; GFX9-NEXT: s_bfe_i32 s4, s4, 0x40018
400 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
401 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
402 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s0
403 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s9
404 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40018
405 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
406 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
407 ; GFX9-NEXT: s_waitcnt vmcnt(0)
408 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v5, v2
409 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
410 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
411 ; GFX9-NEXT: v_mad_u32_u24 v2, v7, v8, v2
412 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v13, v2
413 ; GFX9-NEXT: v_mad_u32_u24 v2, v9, v10, v2
414 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
415 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
416 ; GFX9-NEXT: v_mad_u32_u24 v2, v11, v12, v2
417 ; GFX9-NEXT: global_store_short v[0:1], v2, off
418 ; GFX9-NEXT: s_endpgm
420 ; GFX9-DL-LABEL: idot8_acc16:
421 ; GFX9-DL: ; %bb.0: ; %entry
422 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
423 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
424 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
426 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
427 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
428 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
429 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
430 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4
432 ; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 4
433 ; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40000
434 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
435 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
436 ; GFX9-DL-NEXT: s_bfe_i32 s0, s4, 0x40008
437 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
438 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
439 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12
440 ; GFX9-DL-NEXT: s_lshr_b32 s5, s4, 12
441 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0
442 ; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40008
443 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
444 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
445 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s1
446 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
447 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, s7, v6
448 ; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20
449 ; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 20
450 ; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40010
451 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
452 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
453 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s0
454 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s1
455 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40010
456 ; GFX9-DL-NEXT: v_mov_b32_e32 v13, s5
457 ; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28
458 ; GFX9-DL-NEXT: s_lshr_b32 s9, s4, 28
459 ; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x40018
460 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
461 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
462 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0
463 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s9
464 ; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x40018
465 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
466 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
467 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
469 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
470 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
471 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v7, v8, v2
472 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v13, v2
473 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v9, v10, v2
474 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
475 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
476 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v11, v12, v2
477 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
478 ; GFX9-DL-NEXT: s_endpgm
479 <8 x i4> addrspace(1)* %src2,
480 i16 addrspace(1)* nocapture %dst) {
; Load both packed <8 x i4> operand vectors.
482 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
483 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Lanes 0-7: extract the i4 element from each vector, sign-extend both to
; i16, and multiply them.
; NOTE(review): `nuw` on a product of sign-extended operands looks
; questionable for negative lane values (an unsigned wrap would make the
; result poison) -- confirm the flag combination is intended.
485 %v1e0 = extractelement <8 x i4> %vec1, i64 0
486 %cv1e0 = sext i4 %v1e0 to i16
487 %v2e0 = extractelement <8 x i4> %vec2, i64 0
488 %cv2e0 = sext i4 %v2e0 to i16
489 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
491 %v1e1 = extractelement <8 x i4> %vec1, i64 1
492 %cv1e1 = sext i4 %v1e1 to i16
493 %v2e1 = extractelement <8 x i4> %vec2, i64 1
494 %cv2e1 = sext i4 %v2e1 to i16
495 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
497 %v1e2 = extractelement <8 x i4> %vec1, i64 2
498 %cv1e2 = sext i4 %v1e2 to i16
499 %v2e2 = extractelement <8 x i4> %vec2, i64 2
500 %cv2e2 = sext i4 %v2e2 to i16
501 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
503 %v1e3 = extractelement <8 x i4> %vec1, i64 3
504 %cv1e3 = sext i4 %v1e3 to i16
505 %v2e3 = extractelement <8 x i4> %vec2, i64 3
506 %cv2e3 = sext i4 %v2e3 to i16
507 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
509 %v1e4 = extractelement <8 x i4> %vec1, i64 4
510 %cv1e4 = sext i4 %v1e4 to i16
511 %v2e4 = extractelement <8 x i4> %vec2, i64 4
512 %cv2e4 = sext i4 %v2e4 to i16
513 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
515 %v1e5 = extractelement <8 x i4> %vec1, i64 5
516 %cv1e5 = sext i4 %v1e5 to i16
517 %v2e5 = extractelement <8 x i4> %vec2, i64 5
518 %cv2e5 = sext i4 %v2e5 to i16
519 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
521 %v1e6 = extractelement <8 x i4> %vec1, i64 6
522 %cv1e6 = sext i4 %v1e6 to i16
523 %v2e6 = extractelement <8 x i4> %vec2, i64 6
524 %cv2e6 = sext i4 %v2e6 to i16
525 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
527 %v1e7 = extractelement <8 x i4> %vec1, i64 7
528 %cv1e7 = sext i4 %v1e7 to i16
529 %v2e7 = extractelement <8 x i4> %vec2, i64 7
530 %cv2e7 = sext i4 %v2e7 to i16
531 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
; Add the eight products, in lane order, to the i16 accumulator loaded from
; %dst, then store the total back to %dst.
533 %acc = load i16, i16 addrspace(1)* %dst, align 4
534 %add1 = add i16 %mul0, %acc
535 %add2 = add i16 %add1, %mul1
536 %add3 = add i16 %add2, %mul2
537 %add4 = add i16 %add3, %mul3
538 %add5 = add i16 %add4, %mul4
539 %add6 = add i16 %add5, %mul5
540 %add7 = add i16 %add6, %mul6
541 %add8 = add i16 %add7, %mul7
543 store i16 %add8, i16 addrspace(1)* %dst, align 4
547 ; TODO: Support this pattern.
548 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
549 ; GFX7-LABEL: idot8_acc8:
550 ; GFX7: ; %bb.0: ; %entry
551 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
552 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
553 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
554 ; GFX7-NEXT: s_mov_b32 s6, -1
555 ; GFX7-NEXT: s_movk_i32 s0, 0xff
556 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
558 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
559 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0
560 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000
562 ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004
563 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
564 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004
565 ; GFX7-NEXT: s_and_b32 s9, s9, s0
566 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008
567 ; GFX7-NEXT: s_and_b32 s11, s11, s0
568 ; GFX7-NEXT: s_and_b32 s8, s8, s0
569 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
570 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008
571 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c
572 ; GFX7-NEXT: s_and_b32 s13, s13, s0
573 ; GFX7-NEXT: s_and_b32 s10, s10, s0
574 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
575 ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c
576 ; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010
577 ; GFX7-NEXT: s_and_b32 s15, s15, s0
578 ; GFX7-NEXT: s_and_b32 s12, s12, s0
579 ; GFX7-NEXT: v_mov_b32_e32 v3, s13
580 ; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010
581 ; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014
582 ; GFX7-NEXT: s_and_b32 s17, s17, s0
583 ; GFX7-NEXT: s_and_b32 s14, s14, s0
584 ; GFX7-NEXT: v_mov_b32_e32 v4, s15
585 ; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018
586 ; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014
587 ; GFX7-NEXT: s_and_b32 s19, s19, s0
588 ; GFX7-NEXT: s_and_b32 s16, s16, s0
589 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
590 ; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018
591 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
592 ; GFX7-NEXT: s_and_b32 s21, s21, s0
593 ; GFX7-NEXT: s_and_b32 s18, s18, s0
594 ; GFX7-NEXT: v_mov_b32_e32 v6, s19
595 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
596 ; GFX7-NEXT: s_and_b32 s20, s20, s0
597 ; GFX7-NEXT: s_and_b32 s2, s2, s0
598 ; GFX7-NEXT: v_mov_b32_e32 v7, s21
599 ; GFX7-NEXT: s_and_b32 s0, s1, s0
600 ; GFX7-NEXT: s_waitcnt vmcnt(0)
601 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
602 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0
603 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0
604 ; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0
605 ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0
606 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0
607 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0
608 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
609 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0
610 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
611 ; GFX7-NEXT: s_endpgm
613 ; GFX8-LABEL: idot8_acc8:
614 ; GFX8: ; %bb.0: ; %entry
615 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
616 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
617 ; GFX8-NEXT: s_movk_i32 s2, 0xff
618 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
619 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
620 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
621 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
622 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
623 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
624 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
625 ; GFX8-NEXT: s_lshr_b32 s7, s0, 4
626 ; GFX8-NEXT: s_lshr_b32 s11, s1, 4
627 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s7
628 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s11
629 ; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40000
630 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
631 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
632 ; GFX8-NEXT: s_lshr_b32 s6, s0, 12
633 ; GFX8-NEXT: s_lshr_b32 s10, s1, 12
634 ; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40008
635 ; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40000
636 ; GFX8-NEXT: v_mov_b32_e32 v12, s13
637 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s6
638 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s10
639 ; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40008
640 ; GFX8-NEXT: v_mov_b32_e32 v5, s15
641 ; GFX8-NEXT: v_and_b32_e32 v3, s2, v3
642 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
643 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
644 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
645 ; GFX8-NEXT: s_lshr_b32 s5, s0, 20
646 ; GFX8-NEXT: s_lshr_b32 s9, s1, 20
647 ; GFX8-NEXT: v_mul_i32_i24_e32 v5, s14, v5
648 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
649 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s9
650 ; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40010
651 ; GFX8-NEXT: v_and_b32_e32 v6, s2, v6
652 ; GFX8-NEXT: v_and_b32_e32 v7, s2, v7
653 ; GFX8-NEXT: s_lshr_b32 s8, s1, 28
654 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
655 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
656 ; GFX8-NEXT: s_lshr_b32 s4, s0, 28
657 ; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40010
658 ; GFX8-NEXT: v_mov_b32_e32 v13, s17
659 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
660 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s8
661 ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40018
662 ; GFX8-NEXT: v_and_b32_e32 v8, s2, v8
663 ; GFX8-NEXT: v_and_b32_e32 v9, s2, v9
664 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
665 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
666 ; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40018
667 ; GFX8-NEXT: v_and_b32_e32 v10, s2, v10
668 ; GFX8-NEXT: v_and_b32_e32 v11, s2, v11
669 ; GFX8-NEXT: s_waitcnt vmcnt(0)
670 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v12, v2
671 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
672 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
673 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
674 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v13, v2
675 ; GFX8-NEXT: v_mad_u32_u24 v2, v8, v9, v2
676 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
677 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
678 ; GFX8-NEXT: v_mad_u32_u24 v2, v10, v11, v2
679 ; GFX8-NEXT: flat_store_byte v[0:1], v2
680 ; GFX8-NEXT: s_endpgm
682 ; GFX9-LABEL: idot8_acc8:
683 ; GFX9: ; %bb.0: ; %entry
684 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
685 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
686 ; GFX9-NEXT: s_movk_i32 s2, 0xff
687 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
689 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
690 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
691 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
692 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
693 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
694 ; GFX9-NEXT: s_lshr_b32 s7, s0, 4
695 ; GFX9-NEXT: s_lshr_b32 s11, s1, 4
696 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s7
697 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11
698 ; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40000
699 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
700 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
701 ; GFX9-NEXT: s_lshr_b32 s6, s0, 12
702 ; GFX9-NEXT: s_lshr_b32 s10, s1, 12
703 ; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40008
704 ; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40000
705 ; GFX9-NEXT: v_mov_b32_e32 v12, s13
706 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s6
707 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s10
708 ; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40008
709 ; GFX9-NEXT: v_mov_b32_e32 v5, s15
710 ; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
711 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
712 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
713 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
714 ; GFX9-NEXT: s_lshr_b32 s5, s0, 20
715 ; GFX9-NEXT: s_lshr_b32 s9, s1, 20
716 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, s14, v5
717 ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
718 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s9
719 ; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40010
720 ; GFX9-NEXT: v_and_b32_e32 v6, s2, v6
721 ; GFX9-NEXT: v_and_b32_e32 v7, s2, v7
722 ; GFX9-NEXT: s_lshr_b32 s8, s1, 28
723 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
724 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
725 ; GFX9-NEXT: s_lshr_b32 s4, s0, 28
726 ; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40010
727 ; GFX9-NEXT: v_mov_b32_e32 v13, s17
728 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
729 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s8
730 ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40018
731 ; GFX9-NEXT: v_and_b32_e32 v8, s2, v8
732 ; GFX9-NEXT: v_and_b32_e32 v9, s2, v9
733 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
734 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
735 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40018
736 ; GFX9-NEXT: v_and_b32_e32 v10, s2, v10
737 ; GFX9-NEXT: v_and_b32_e32 v11, s2, v11
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v12, v2
740 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
741 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
742 ; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2
743 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v13, v2
744 ; GFX9-NEXT: v_mad_u32_u24 v2, v8, v9, v2
745 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
746 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
747 ; GFX9-NEXT: v_mad_u32_u24 v2, v10, v11, v2
748 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
749 ; GFX9-NEXT: s_endpgm
751 ; GFX9-DL-LABEL: idot8_acc8:
752 ; GFX9-DL: ; %bb.0: ; %entry
753 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
754 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
755 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
756 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
757 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
758 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
759 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
760 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
761 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
762 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4
764 ; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 4
765 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s7
766 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
767 ; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40000
768 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
769 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
770 ; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 12
771 ; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12
772 ; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40008
773 ; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40000
774 ; GFX9-DL-NEXT: v_mov_b32_e32 v12, s13
775 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s6
776 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10
777 ; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40008
778 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
779 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
780 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
781 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
782 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
783 ; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20
784 ; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 20
785 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, s14, v5
786 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
787 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s9
788 ; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40010
789 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s2, v6
790 ; GFX9-DL-NEXT: v_and_b32_e32 v7, s2, v7
791 ; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 28
792 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
793 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
794 ; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28
795 ; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40010
796 ; GFX9-DL-NEXT: v_mov_b32_e32 v13, s17
797 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
798 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s8
799 ; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x40018
800 ; GFX9-DL-NEXT: v_and_b32_e32 v8, s2, v8
801 ; GFX9-DL-NEXT: v_and_b32_e32 v9, s2, v9
802 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
803 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
804 ; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x40018
805 ; GFX9-DL-NEXT: v_and_b32_e32 v10, s2, v10
806 ; GFX9-DL-NEXT: v_and_b32_e32 v11, s2, v11
807 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
808 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v12, v2
809 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
810 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
811 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2
812 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v13, v2
813 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v8, v9, v2
814 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
815 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
816 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v10, v11, v2
817 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
818 ; GFX9-DL-NEXT: s_endpgm
819 <8 x i4> addrspace(1)* %src2,
820 i8 addrspace(1)* nocapture %dst) {
822 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
823 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
825 %v1e0 = extractelement <8 x i4> %vec1, i64 0
826 %cv1e0 = sext i4 %v1e0 to i8
827 %v2e0 = extractelement <8 x i4> %vec2, i64 0
828 %cv2e0 = sext i4 %v2e0 to i8
829 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
831 %v1e1 = extractelement <8 x i4> %vec1, i64 1
832 %cv1e1 = sext i4 %v1e1 to i8
833 %v2e1 = extractelement <8 x i4> %vec2, i64 1
834 %cv2e1 = sext i4 %v2e1 to i8
835 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
837 %v1e2 = extractelement <8 x i4> %vec1, i64 2
838 %cv1e2 = sext i4 %v1e2 to i8
839 %v2e2 = extractelement <8 x i4> %vec2, i64 2
840 %cv2e2 = sext i4 %v2e2 to i8
841 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
843 %v1e3 = extractelement <8 x i4> %vec1, i64 3
844 %cv1e3 = sext i4 %v1e3 to i8
845 %v2e3 = extractelement <8 x i4> %vec2, i64 3
846 %cv2e3 = sext i4 %v2e3 to i8
847 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
849 %v1e4 = extractelement <8 x i4> %vec1, i64 4
850 %cv1e4 = sext i4 %v1e4 to i8
851 %v2e4 = extractelement <8 x i4> %vec2, i64 4
852 %cv2e4 = sext i4 %v2e4 to i8
853 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
855 %v1e5 = extractelement <8 x i4> %vec1, i64 5
856 %cv1e5 = sext i4 %v1e5 to i8
857 %v2e5 = extractelement <8 x i4> %vec2, i64 5
858 %cv2e5 = sext i4 %v2e5 to i8
859 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
861 %v1e6 = extractelement <8 x i4> %vec1, i64 6
862 %cv1e6 = sext i4 %v1e6 to i8
863 %v2e6 = extractelement <8 x i4> %vec2, i64 6
864 %cv2e6 = sext i4 %v2e6 to i8
865 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
867 %v1e7 = extractelement <8 x i4> %vec1, i64 7
868 %cv1e7 = sext i4 %v1e7 to i8
869 %v2e7 = extractelement <8 x i4> %vec2, i64 7
870 %cv2e7 = sext i4 %v2e7 to i8
871 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
873 %acc = load i8, i8 addrspace(1)* %dst, align 4
874 %add1 = add i8 %mul0, %acc
875 %add2 = add i8 %add1, %mul1
876 %add3 = add i8 %add2, %mul2
877 %add4 = add i8 %add3, %mul3
878 %add5 = add i8 %add4, %mul4
879 %add6 = add i8 %add5, %mul5
880 %add7 = add i8 %add6, %mul6
881 %add8 = add i8 %add7, %mul7
883 store i8 %add8, i8 addrspace(1)* %dst, align 4
887 ; Make sure the pattern is not recognized if there are multiple uses of the
888 ; intermediate multiplications.
; Kernel under test: loads two <8 x i4> vectors from %src1 and %src2,
; sign-extends every lane to i32 and multiplies matching lanes. %mul0 is added
; into the accumulator twice (%add and %add1) and %add is reused by the final
; %res, so these intermediate values have more than one use.
; The autogenerated checks for all four run lines (gfx700, gfx803, gfx900 and
; gfx906) expect a v_mad_i32_i24 chain followed by one vector add before the
; store.
889 define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
890 ; GFX7-LABEL: idot8_multiuses_mul1:
891 ; GFX7: ; %bb.0: ; %entry
892 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
893 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
894 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
895 ; GFX7-NEXT: s_mov_b32 s6, -1
896 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
897 ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
898 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
899 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
900 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
901 ; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000
902 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000
903 ; GFX7-NEXT: v_mov_b32_e32 v0, s9
904 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
905 ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
906 ; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004
907 ; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004
908 ; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008
909 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
910 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
911 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0
912 ; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008
913 ; GFX7-NEXT: v_mov_b32_e32 v2, s13
914 ; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c
915 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v2, v0
916 ; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c
917 ; GFX7-NEXT: v_mov_b32_e32 v2, s15
918 ; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010
919 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0
920 ; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010
921 ; GFX7-NEXT: v_mov_b32_e32 v2, s17
922 ; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014
923 ; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018
924 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0
925 ; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014
926 ; GFX7-NEXT: v_mov_b32_e32 v2, s19
927 ; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018
928 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0
929 ; GFX7-NEXT: v_mov_b32_e32 v2, s21
930 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
931 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v2, v0
932 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28
933 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
934 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v2, v0
935 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
936 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
937 ; GFX7-NEXT: s_endpgm
939 ; GFX8-LABEL: idot8_multiuses_mul1:
940 ; GFX8: ; %bb.0: ; %entry
941 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
942 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
943 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
944 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
945 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
946 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
947 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
948 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
949 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
951 ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
952 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
953 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
954 ; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3
955 ; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004
956 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
957 ; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008
958 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
959 ; GFX8-NEXT: v_mov_b32_e32 v4, s7
960 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v4, v2
961 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
962 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
963 ; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c
964 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2
965 ; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
966 ; GFX8-NEXT: v_mov_b32_e32 v4, s11
967 ; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010
968 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v4, v2
969 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
970 ; GFX8-NEXT: v_mov_b32_e32 v4, s13
971 ; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014
972 ; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018
973 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v4, v2
974 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
975 ; GFX8-NEXT: v_mov_b32_e32 v4, s15
976 ; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
977 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2
978 ; GFX8-NEXT: v_mov_b32_e32 v4, s17
979 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28
980 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2
981 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28
982 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
983 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2
984 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
985 ; GFX8-NEXT: flat_store_dword v[0:1], v2
986 ; GFX8-NEXT: s_endpgm
988 ; GFX9-LABEL: idot8_multiuses_mul1:
989 ; GFX9: ; %bb.0: ; %entry
990 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
991 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
992 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
993 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
994 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
995 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
996 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
997 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
998 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
1000 ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
1001 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1002 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
1003 ; GFX9-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1004 ; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004
1005 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
1006 ; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008
1007 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1008 ; GFX9-NEXT: v_mov_b32_e32 v4, s7
1009 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v4, v2
1010 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
1011 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1012 ; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c
1013 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v4, v2
1014 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
1015 ; GFX9-NEXT: v_mov_b32_e32 v4, s11
1016 ; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010
1017 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v4, v2
1018 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
1019 ; GFX9-NEXT: v_mov_b32_e32 v4, s13
1020 ; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014
1021 ; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018
1022 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v4, v2
1023 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
1024 ; GFX9-NEXT: v_mov_b32_e32 v4, s15
1025 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
1026 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v4, v2
1027 ; GFX9-NEXT: v_mov_b32_e32 v4, s17
1028 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28
1029 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v4, v2
1030 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28
1031 ; GFX9-NEXT: v_mov_b32_e32 v4, s4
1032 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v4, v2
1033 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1034 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1035 ; GFX9-NEXT: s_endpgm
1037 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
1038 ; GFX9-DL: ; %bb.0: ; %entry
1039 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1040 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1041 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1042 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1043 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1044 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1045 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1046 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1047 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1048 ; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
1049 ; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
1050 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1
1051 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
1052 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1053 ; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40004
1054 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004
1055 ; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40008
1056 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1057 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7
1058 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v4, v2
1059 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008
1060 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9
1061 ; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x4000c
1062 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v4, v2
1063 ; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c
1064 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
1065 ; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x40010
1066 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v4, v2
1067 ; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
1068 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13
1069 ; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40014
1070 ; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40018
1071 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v4, v2
1072 ; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
1073 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
1074 ; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
1075 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v4, v2
1076 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17
1077 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
1078 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v4, v2
1079 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
1080 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
1081 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
1082 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1083 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1084 ; GFX9-DL-NEXT: s_endpgm
1085 <8 x i4> addrspace(1)* %src2,
1086 i32 addrspace(1)* nocapture %dst) {
; Load both packed 4-bit operand vectors.
1088 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1089 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Lanes 0-7: extract the i4 element from each vector, sign-extend both to
; i32 and multiply them.
1091 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1092 %cv1e0 = sext i4 %v1e0 to i32
1093 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1094 %cv2e0 = sext i4 %v2e0 to i32
1095 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1097 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1098 %cv1e1 = sext i4 %v1e1 to i32
1099 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1100 %cv2e1 = sext i4 %v2e1 to i32
1101 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1103 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1104 %cv1e2 = sext i4 %v1e2 to i32
1105 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1106 %cv2e2 = sext i4 %v2e2 to i32
1107 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1109 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1110 %cv1e3 = sext i4 %v1e3 to i32
1111 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1112 %cv2e3 = sext i4 %v2e3 to i32
1113 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1115 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1116 %cv1e4 = sext i4 %v1e4 to i32
1117 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1118 %cv2e4 = sext i4 %v2e4 to i32
1119 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1121 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1122 %cv1e5 = sext i4 %v1e5 to i32
1123 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1124 %cv2e5 = sext i4 %v2e5 to i32
1125 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1127 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1128 %cv1e6 = sext i4 %v1e6 to i32
1129 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1130 %cv2e6 = sext i4 %v2e6 to i32
1131 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1133 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1134 %cv1e7 = sext i4 %v1e7 to i32
1135 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1136 %cv2e7 = sext i4 %v2e7 to i32
1137 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Accumulate into the i32 loaded from %dst. %mul0 is added twice (%add and
; %add1); %mul1 through %mul7 are each added once.
1139 %acc = load i32, i32 addrspace(1)* %dst, align 4
1140 %add = add i32 %mul0, %acc
1141 %add1 = add i32 %mul0, %add
1142 %add2 = add i32 %add1, %mul1
1143 %add3 = add i32 %add2, %mul2
1144 %add4 = add i32 %add3, %mul3
1145 %add5 = add i32 %add4, %mul4
1146 %add6 = add i32 %add5, %mul5
1147 %add7 = add i32 %add6, %mul6
1148 %add8 = add i32 %add7, %mul7
; Second use of %add: it is added to the end of the chain, and the sum is
; stored back to %dst.
1150 %res = add i32 %add, %add8
1151 store i32 %res, i32 addrspace(1)* %dst, align 4
1155 ; TODO: Support this pattern.
; Kernel under test: both <8 x i4> inputs are sign-extended as whole vectors
; to <8 x i32> and multiplied with a single vector mul. The eight products are
; then extracted and added one by one to the i32 loaded from %dst, and the
; result is stored back to %dst.
; The autogenerated checks for all four run lines (gfx700, gfx803, gfx900 and
; gfx906) expect scalar shifts (s_lshl_b32 / s_ashr_i64) feeding a
; v_mad_i32_i24 chain.
1156 define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1157 ; GFX7-LABEL: idot8_acc32_vecMul:
1158 ; GFX7: ; %bb.0: ; %entry
1159 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1160 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1161 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1162 ; GFX7-NEXT: s_mov_b32 s6, -1
1163 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1164 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
1165 ; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0
1166 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
1167 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1168 ; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
1169 ; GFX7-NEXT: s_lshl_b32 s11, s1, 4
1170 ; GFX7-NEXT: s_ashr_i64 s[14:15], s[10:11], 60
1171 ; GFX7-NEXT: s_lshl_b32 s11, s1, 12
1172 ; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60
1173 ; GFX7-NEXT: s_lshl_b32 s11, s1, 16
1174 ; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60
1175 ; GFX7-NEXT: s_lshl_b32 s11, s1, 20
1176 ; GFX7-NEXT: s_lshl_b32 s13, s1, 8
1177 ; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60
1178 ; GFX7-NEXT: s_lshl_b32 s11, s1, 24
1179 ; GFX7-NEXT: s_lshl_b32 s1, s1, 28
1180 ; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1181 ; GFX7-NEXT: s_lshl_b32 s1, s9, 4
1182 ; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1183 ; GFX7-NEXT: s_lshl_b32 s1, s9, 8
1184 ; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1185 ; GFX7-NEXT: s_lshl_b32 s1, s9, 12
1186 ; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1187 ; GFX7-NEXT: s_lshl_b32 s1, s9, 16
1188 ; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1189 ; GFX7-NEXT: s_lshl_b32 s1, s9, 20
1190 ; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60
1191 ; GFX7-NEXT: s_lshl_b32 s1, s9, 24
1192 ; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60
1193 ; GFX7-NEXT: s_lshl_b32 s1, s9, 28
1194 ; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60
1195 ; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60
1196 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
1197 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
1198 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1
1199 ; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60
1200 ; GFX7-NEXT: v_mov_b32_e32 v1, s36
1201 ; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0
1202 ; GFX7-NEXT: v_mov_b32_e32 v1, s34
1203 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0
1204 ; GFX7-NEXT: v_mov_b32_e32 v1, s32
1205 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
1206 ; GFX7-NEXT: v_mov_b32_e32 v1, s30
1207 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0
1208 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
1209 ; GFX7-NEXT: v_mov_b32_e32 v1, s28
1210 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
1211 ; GFX7-NEXT: v_mov_b32_e32 v1, s26
1212 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0
1213 ; GFX7-NEXT: v_mov_b32_e32 v1, s24
1214 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
1215 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1216 ; GFX7-NEXT: s_endpgm
1218 ; GFX8-LABEL: idot8_acc32_vecMul:
1219 ; GFX8: ; %bb.0: ; %entry
1220 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1221 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1222 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1223 ; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0
1224 ; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0
1225 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1226 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1227 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1228 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1229 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1230 ; GFX8-NEXT: s_lshl_b32 s1, s5, 4
1231 ; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1232 ; GFX8-NEXT: s_lshl_b32 s1, s5, 16
1233 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1234 ; GFX8-NEXT: s_lshl_b32 s1, s5, 20
1235 ; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1236 ; GFX8-NEXT: s_lshl_b32 s1, s5, 24
1237 ; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1238 ; GFX8-NEXT: s_lshl_b32 s1, s5, 28
1239 ; GFX8-NEXT: s_lshl_b32 s9, s5, 8
1240 ; GFX8-NEXT: s_lshl_b32 s11, s5, 12
1241 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1242 ; GFX8-NEXT: s_lshl_b32 s1, s7, 4
1243 ; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1244 ; GFX8-NEXT: s_lshl_b32 s1, s7, 8
1245 ; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1246 ; GFX8-NEXT: s_lshl_b32 s1, s7, 12
1247 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1248 ; GFX8-NEXT: s_lshl_b32 s1, s7, 16
1249 ; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1250 ; GFX8-NEXT: s_lshl_b32 s1, s7, 20
1251 ; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1252 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24
1253 ; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1254 ; GFX8-NEXT: s_lshl_b32 s1, s7, 28
1255 ; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1256 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1257 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
1258 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
1259 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1260 ; GFX8-NEXT: v_mov_b32_e32 v3, s32
1261 ; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1262 ; GFX8-NEXT: v_mov_b32_e32 v3, s30
1263 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1264 ; GFX8-NEXT: v_mov_b32_e32 v3, s28
1265 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1266 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1267 ; GFX8-NEXT: v_mov_b32_e32 v3, s26
1268 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1269 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1270 ; GFX8-NEXT: v_mov_b32_e32 v3, s24
1271 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1272 ; GFX8-NEXT: v_mov_b32_e32 v3, s22
1273 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1274 ; GFX8-NEXT: v_mov_b32_e32 v3, s20
1275 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1276 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1277 ; GFX8-NEXT: s_endpgm
1279 ; GFX9-LABEL: idot8_acc32_vecMul:
1280 ; GFX9: ; %bb.0: ; %entry
1281 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1282 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1284 ; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0
1285 ; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0
1286 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
1287 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1288 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1290 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1291 ; GFX9-NEXT: s_lshl_b32 s1, s5, 4
1292 ; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1293 ; GFX9-NEXT: s_lshl_b32 s1, s5, 16
1294 ; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1295 ; GFX9-NEXT: s_lshl_b32 s1, s5, 20
1296 ; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1297 ; GFX9-NEXT: s_lshl_b32 s1, s5, 24
1298 ; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1299 ; GFX9-NEXT: s_lshl_b32 s1, s5, 28
1300 ; GFX9-NEXT: s_lshl_b32 s9, s5, 8
1301 ; GFX9-NEXT: s_lshl_b32 s11, s5, 12
1302 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1303 ; GFX9-NEXT: s_lshl_b32 s1, s7, 4
1304 ; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1305 ; GFX9-NEXT: s_lshl_b32 s1, s7, 8
1306 ; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1307 ; GFX9-NEXT: s_lshl_b32 s1, s7, 12
1308 ; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1309 ; GFX9-NEXT: s_lshl_b32 s1, s7, 16
1310 ; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1311 ; GFX9-NEXT: s_lshl_b32 s1, s7, 20
1312 ; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1313 ; GFX9-NEXT: s_lshl_b32 s1, s7, 24
1314 ; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1315 ; GFX9-NEXT: s_lshl_b32 s1, s7, 28
1316 ; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1317 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1318 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1319 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1320 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1321 ; GFX9-NEXT: v_mov_b32_e32 v3, s32
1322 ; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1323 ; GFX9-NEXT: v_mov_b32_e32 v3, s30
1324 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1325 ; GFX9-NEXT: v_mov_b32_e32 v3, s28
1326 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1327 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1328 ; GFX9-NEXT: v_mov_b32_e32 v3, s26
1329 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1330 ; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1331 ; GFX9-NEXT: v_mov_b32_e32 v3, s24
1332 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1333 ; GFX9-NEXT: v_mov_b32_e32 v3, s22
1334 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1335 ; GFX9-NEXT: v_mov_b32_e32 v3, s20
1336 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1337 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1338 ; GFX9-NEXT: s_endpgm
1340 ; GFX9-DL-LABEL: idot8_acc32_vecMul:
1341 ; GFX9-DL: ; %bb.0: ; %entry
1342 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1343 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1344 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1345 ; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0
1346 ; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0
1347 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1348 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1349 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1350 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1351 ; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1352 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4
1353 ; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1354 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16
1355 ; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1356 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20
1357 ; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1358 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24
1359 ; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1360 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28
1361 ; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8
1362 ; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12
1363 ; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1364 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4
1365 ; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1366 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8
1367 ; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1368 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12
1369 ; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1370 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16
1371 ; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1372 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20
1373 ; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1374 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24
1375 ; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1376 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28
1377 ; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1378 ; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1379 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
1380 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
1381 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1382 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32
1383 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1384 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30
1385 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1386 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28
1387 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1388 ; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1389 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26
1390 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1391 ; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1392 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24
1393 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1394 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22
1395 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1396 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20
1397 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1398 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1399 ; GFX9-DL-NEXT: s_endpgm
1400 <8 x i4> addrspace(1)* %src2,
1401 i32 addrspace(1)* nocapture %dst) {
; Load both packed 4-bit operand vectors.
1403 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1404 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Sign-extend all eight lanes at once rather than lane by lane.
1406 %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
1407 %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
; One vector multiply produces all eight products; each lane is then pulled
; out as a scalar.
1409 %mul = mul <8 x i32> %cvec1, %cvec2
1410 %mul0 = extractelement <8 x i32> %mul, i64 0
1411 %mul1 = extractelement <8 x i32> %mul, i64 1
1412 %mul2 = extractelement <8 x i32> %mul, i64 2
1413 %mul3 = extractelement <8 x i32> %mul, i64 3
1414 %mul4 = extractelement <8 x i32> %mul, i64 4
1415 %mul5 = extractelement <8 x i32> %mul, i64 5
1416 %mul6 = extractelement <8 x i32> %mul, i64 6
1417 %mul7 = extractelement <8 x i32> %mul, i64 7
; Add the eight products, in lane order, to the i32 loaded from %dst.
1419 %acc = load i32, i32 addrspace(1)* %dst, align 4
1420 %add1 = add i32 %mul0, %acc
1421 %add2 = add i32 %add1, %mul1
1422 %add3 = add i32 %add2, %mul2
1423 %add4 = add i32 %add3, %mul3
1424 %add5 = add i32 %add4, %mul4
1425 %add6 = add i32 %add5, %mul5
1426 %add7 = add i32 %add6, %mul6
1427 %add8 = add i32 %add7, %mul7
; Write the accumulated sum back to %dst.
1429 store i32 %add8, i32 addrspace(1)* %dst, align 4
1433 ; TODO: Support this pattern.
1434 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
1435 ; GFX7-LABEL: idot8_acc16_vecMul:
1436 ; GFX7: ; %bb.0: ; %entry
1437 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1438 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1439 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1440 ; GFX7-NEXT: s_mov_b32 s6, -1
1441 ; GFX7-NEXT: s_mov_b32 s0, 0xffff
1442 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1443 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
1444 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
1445 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0
1446 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1447 ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40010
1448 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40014
1449 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40010
1450 ; GFX7-NEXT: s_bfe_i32 s16, s2, 0x40014
1451 ; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40018
1452 ; GFX7-NEXT: s_ashr_i32 s18, s2, 28
1453 ; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40000
1454 ; GFX7-NEXT: s_bfe_i32 s20, s2, 0x40004
1455 ; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40008
1456 ; GFX7-NEXT: s_bfe_i32 s2, s2, 0x4000c
1457 ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40018
1458 ; GFX7-NEXT: s_ashr_i32 s11, s1, 28
1459 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40000
1460 ; GFX7-NEXT: v_mov_b32_e32 v4, s19
1461 ; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40004
1462 ; GFX7-NEXT: v_mov_b32_e32 v3, s20
1463 ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x40008
1464 ; GFX7-NEXT: v_mov_b32_e32 v2, s21
1465 ; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c
1466 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
1467 ; GFX7-NEXT: v_mov_b32_e32 v5, s18
1468 ; GFX7-NEXT: v_mov_b32_e32 v6, s17
1469 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s1, v1
1470 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2
1471 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3
1472 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4
1473 ; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5
1474 ; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6
1475 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1476 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
1477 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1478 ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4
1479 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
1480 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
1481 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
1482 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6
1483 ; GFX7-NEXT: v_mov_b32_e32 v7, s16
1484 ; GFX7-NEXT: v_mov_b32_e32 v8, s15
1485 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5
1486 ; GFX7-NEXT: v_alignbit_b32 v5, v1, v2, 16
1487 ; GFX7-NEXT: v_mul_i32_i24_e32 v7, s9, v7
1488 ; GFX7-NEXT: v_mul_i32_i24_e32 v8, s8, v8
1489 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
1490 ; GFX7-NEXT: v_and_b32_e32 v8, s0, v8
1491 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1492 ; GFX7-NEXT: v_or_b32_e32 v4, v8, v7
1493 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1494 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v3
1495 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1496 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1497 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0
1498 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1499 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0
1500 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
1501 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0
1502 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
1503 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v8, v0
1504 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
1505 ; GFX7-NEXT: s_endpgm
1507 ; GFX8-LABEL: idot8_acc16_vecMul:
1508 ; GFX8: ; %bb.0: ; %entry
1509 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1510 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1511 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1512 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1513 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1514 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1515 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1516 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
1517 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1518 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2
1519 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4
1520 ; GFX8-NEXT: s_lshr_b32 s0, s2, 4
1521 ; GFX8-NEXT: s_lshr_b32 s1, s2, 8
1522 ; GFX8-NEXT: s_lshr_b32 s5, s4, 4
1523 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8
1524 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1
1525 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0
1526 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6
1527 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
1528 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1529 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
1530 ; GFX8-NEXT: s_lshr_b32 s0, s2, 12
1531 ; GFX8-NEXT: s_lshr_b32 s1, s4, 12
1532 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1533 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1534 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1535 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1536 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0
1537 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1
1538 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16
1539 ; GFX8-NEXT: s_lshr_b32 s6, s4, 16
1540 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7
1541 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5
1542 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6
1543 ; GFX8-NEXT: s_lshr_b32 s0, s2, 20
1544 ; GFX8-NEXT: s_lshr_b32 s1, s4, 20
1545 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1546 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1547 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0
1548 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1
1549 ; GFX8-NEXT: s_lshr_b32 s5, s2, 24
1550 ; GFX8-NEXT: s_lshr_b32 s6, s4, 24
1551 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1552 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1553 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5
1554 ; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6
1555 ; GFX8-NEXT: s_lshr_b32 s0, s2, 28
1556 ; GFX8-NEXT: s_lshr_b32 s1, s4, 28
1557 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1558 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1559 ; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0
1560 ; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1
1561 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
1562 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
1563 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
1564 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
1565 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1566 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
1567 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2
1568 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1569 ; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2
1570 ; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2
1571 ; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2
1572 ; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2
1573 ; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2
1574 ; GFX8-NEXT: flat_store_short v[0:1], v2
1575 ; GFX8-NEXT: s_endpgm
1577 ; GFX9-LABEL: idot8_acc16_vecMul:
1578 ; GFX9: ; %bb.0: ; %entry
1579 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1580 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1581 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1582 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
1583 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1584 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1585 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1586 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
1587 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1588 ; GFX9-NEXT: s_and_b32 s0, s2, 15
1589 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
1590 ; GFX9-NEXT: s_and_b32 s5, s4, 15
1591 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004
1592 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1593 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6
1594 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
1595 ; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
1596 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1597 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6
1598 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010
1599 ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014
1600 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
1601 ; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018
1602 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10
1603 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28
1604 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
1605 ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1606 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2
1607 ; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008
1608 ; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c
1609 ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1610 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8
1611 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1612 ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1613 ; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
1614 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7
1615 ; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010
1616 ; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014
1617 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12
1618 ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1619 ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1620 ; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
1621 ; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018
1622 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28
1623 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
1624 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4
1625 ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1626 ; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
1627 ; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
1628 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9
1629 ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1630 ; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
1631 ; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10
1632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1633 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1634 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1635 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1636 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1637 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
1638 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1639 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
1640 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1641 ; GFX9-NEXT: global_store_short v[0:1], v2, off
1642 ; GFX9-NEXT: s_endpgm
1644 ; GFX9-DL-LABEL: idot8_acc16_vecMul:
1645 ; GFX9-DL: ; %bb.0: ; %entry
1646 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1647 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1648 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1649 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1650 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1651 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1652 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1653 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
1654 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1655 ; GFX9-DL-NEXT: s_and_b32 s0, s2, 15
1656 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
1657 ; GFX9-DL-NEXT: s_and_b32 s5, s4, 15
1658 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1659 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1660 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6
1661 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
1662 ; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
1663 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1664 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6
1665 ; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010
1666 ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014
1667 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
1668 ; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018
1669 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10
1670 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
1671 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
1672 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1673 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2
1674 ; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008
1675 ; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c
1676 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1677 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8
1678 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1679 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1680 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
1681 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7
1682 ; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010
1683 ; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014
1684 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12
1685 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1686 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1687 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
1688 ; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018
1689 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
1690 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
1691 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4
1692 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1693 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
1694 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
1695 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9
1696 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1697 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
1698 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10
1699 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1700 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1701 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1702 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1703 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1704 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
1705 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1706 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6
1707 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1708 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
1709 ; GFX9-DL-NEXT: s_endpgm
1710 <8 x i4> addrspace(1)* %src2,
1711 i16 addrspace(1)* nocapture %dst) {
1713 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1714 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1716 %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
1717 %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
1719 %mul = mul <8 x i16> %cvec1, %cvec2
1720 %mul0 = extractelement <8 x i16> %mul, i64 0
1721 %mul1 = extractelement <8 x i16> %mul, i64 1
1722 %mul2 = extractelement <8 x i16> %mul, i64 2
1723 %mul3 = extractelement <8 x i16> %mul, i64 3
1724 %mul4 = extractelement <8 x i16> %mul, i64 4
1725 %mul5 = extractelement <8 x i16> %mul, i64 5
1726 %mul6 = extractelement <8 x i16> %mul, i64 6
1727 %mul7 = extractelement <8 x i16> %mul, i64 7
1729 %acc = load i16, i16 addrspace(1)* %dst, align 4
1730 %add1 = add i16 %mul0, %acc
1731 %add2 = add i16 %add1, %mul1
1732 %add3 = add i16 %add2, %mul2
1733 %add4 = add i16 %add3, %mul3
1734 %add5 = add i16 %add4, %mul4
1735 %add6 = add i16 %add5, %mul5
1736 %add7 = add i16 %add6, %mul6
1737 %add8 = add i16 %add7, %mul7
1739 store i16 %add8, i16 addrspace(1)* %dst, align 4
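; TODO: Support this pattern (vector mul of sign-extended <8 x i4> operands
; summed into an i8 accumulator); the checks below show that no dot
; instruction is currently selected for it, even on GFX9-DL.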
1744 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
1745 ; GFX7-LABEL: idot8_acc8_vecMul:
1746 ; GFX7: ; %bb.0: ; %entry
1747 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1748 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1749 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1750 ; GFX7-NEXT: s_mov_b32 s6, -1
1751 ; GFX7-NEXT: s_movk_i32 s0, 0xff
1752 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1753 ; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
1754 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
1755 ; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0
1756 ; GFX7-NEXT: s_mov_b32 s1, 0xffff
1757 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1758 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
1759 ; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004
1760 ; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000
1761 ; GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004
1762 ; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008
1763 ; GFX7-NEXT: s_bfe_i32 s19, s8, 0x4000c
1764 ; GFX7-NEXT: s_bfe_i32 s20, s8, 0x40010
1765 ; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014
1766 ; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018
1767 ; GFX7-NEXT: s_ashr_i32 s8, s8, 28
1768 ; GFX7-NEXT: v_mov_b32_e32 v7, s17
1769 ; GFX7-NEXT: v_mov_b32_e32 v8, s16
1770 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008
1771 ; GFX7-NEXT: v_mov_b32_e32 v6, s18
1772 ; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c
1773 ; GFX7-NEXT: v_mov_b32_e32 v5, s19
1774 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40010
1775 ; GFX7-NEXT: v_mov_b32_e32 v4, s20
1776 ; GFX7-NEXT: s_bfe_i32 s14, s2, 0x40014
1777 ; GFX7-NEXT: v_mov_b32_e32 v3, s21
1778 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40018
1779 ; GFX7-NEXT: v_mov_b32_e32 v2, s22
1780 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
1781 ; GFX7-NEXT: v_mov_b32_e32 v1, s8
1782 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s2, v1
1783 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s15, v2
1784 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s14, v3
1785 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s13, v4
1786 ; GFX7-NEXT: v_mul_i32_i24_e32 v5, s12, v5
1787 ; GFX7-NEXT: v_mul_i32_i24_e32 v6, s11, v6
1788 ; GFX7-NEXT: v_mul_i32_i24_e32 v7, s10, v7
1789 ; GFX7-NEXT: v_mul_i32_i24_e32 v8, s9, v8
1790 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1791 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
1792 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1793 ; GFX7-NEXT: v_and_b32_e32 v4, s0, v4
1794 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
1795 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6
1796 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7
1797 ; GFX7-NEXT: v_and_b32_e32 v8, s0, v8
1798 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
1799 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
1800 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5
1801 ; GFX7-NEXT: v_or_b32_e32 v4, v8, v7
1802 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1803 ; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
1804 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1805 ; GFX7-NEXT: v_and_b32_e32 v4, s1, v4
1806 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
1807 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
1808 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8
1809 ; GFX7-NEXT: v_alignbit_b32 v4, v1, v2, 16
1810 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
1811 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v1
1812 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1813 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1
1814 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1815 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1816 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
1817 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
1818 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0
1819 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1820 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0
1821 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0
1822 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v8, v0
1823 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
1824 ; GFX7-NEXT: s_endpgm
1826 ; GFX8-LABEL: idot8_acc8_vecMul:
1827 ; GFX8: ; %bb.0: ; %entry
1828 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1829 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1830 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1831 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1832 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1833 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1834 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1835 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
1836 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1837 ; GFX8-NEXT: s_lshr_b32 s0, s2, 4
1838 ; GFX8-NEXT: s_lshr_b32 s1, s2, 12
1839 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8
1840 ; GFX8-NEXT: s_lshr_b32 s6, s4, 4
1841 ; GFX8-NEXT: s_lshr_b32 s7, s4, 12
1842 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8
1843 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s5
1844 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
1845 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s0
1846 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8
1847 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7
1848 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6
1849 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s2
1850 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
1851 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1852 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1853 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
1854 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1855 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1856 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1857 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1858 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1859 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1860 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1861 ; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1862 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1863 ; GFX8-NEXT: s_lshr_b32 s0, s2, 20
1864 ; GFX8-NEXT: s_lshr_b32 s1, s2, 16
1865 ; GFX8-NEXT: s_lshr_b32 s5, s2, 28
1866 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24
1867 ; GFX8-NEXT: s_lshr_b32 s6, s4, 20
1868 ; GFX8-NEXT: s_lshr_b32 s7, s4, 16
1869 ; GFX8-NEXT: s_lshr_b32 s8, s4, 28
1870 ; GFX8-NEXT: s_lshr_b32 s4, s4, 24
1871 ; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1872 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1873 ; GFX8-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1874 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s2
1875 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
1876 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s1
1877 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s0
1878 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4
1879 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s8
1880 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s7
1881 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s6
1882 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1883 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1884 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1885 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1886 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1887 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1888 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1889 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1890 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
1891 ; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1892 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v8, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1893 ; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1894 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v10, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1895 ; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1896 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1897 ; GFX8-NEXT: v_or_b32_sdwa v4, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1898 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4
1899 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1900 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1901 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
1902 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
1903 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1904 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
1905 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
1906 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1907 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1908 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1909 ; GFX8-NEXT: s_endpgm
1911 ; GFX9-LABEL: idot8_acc8_vecMul:
1912 ; GFX9: ; %bb.0: ; %entry
1913 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1914 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1915 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1916 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
1917 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1918 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1919 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1920 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
1921 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1922 ; GFX9-NEXT: s_lshr_b32 s0, s2, 4
1923 ; GFX9-NEXT: s_lshr_b32 s1, s2, 12
1924 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8
1925 ; GFX9-NEXT: s_lshr_b32 s6, s4, 4
1926 ; GFX9-NEXT: s_lshr_b32 s7, s4, 12
1927 ; GFX9-NEXT: s_lshr_b32 s8, s4, 8
1928 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5
1929 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
1930 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s0
1931 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8
1932 ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7
1933 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6
1934 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s2
1935 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
1936 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1937 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1938 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
1939 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1940 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1941 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1942 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1943 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1944 ; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v10
1945 ; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1946 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1947 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v7
1948 ; GFX9-NEXT: s_lshr_b32 s0, s2, 20
1949 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16
1950 ; GFX9-NEXT: s_lshr_b32 s5, s2, 28
1951 ; GFX9-NEXT: s_lshr_b32 s2, s2, 24
1952 ; GFX9-NEXT: s_lshr_b32 s6, s4, 20
1953 ; GFX9-NEXT: s_lshr_b32 s7, s4, 16
1954 ; GFX9-NEXT: s_lshr_b32 s8, s4, 28
1955 ; GFX9-NEXT: s_lshr_b32 s4, s4, 24
1956 ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1957 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1958 ; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1959 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s2
1960 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5
1961 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s1
1962 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s0
1963 ; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s4
1964 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s8
1965 ; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s7
1966 ; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s6
1967 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1968 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1969 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1970 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1971 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1972 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
1973 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1974 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16
1975 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v3
1976 ; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v12, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1977 ; GFX9-NEXT: v_mul_lo_u16_e32 v11, v11, v15
1978 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1979 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v13
1980 ; GFX9-NEXT: v_or_b32_sdwa v7, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1981 ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1982 ; GFX9-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1983 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1984 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1985 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
1986 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
1987 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1988 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
1989 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4
1990 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
1991 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1992 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1993 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
1994 ; GFX9-NEXT: s_endpgm
1996 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
1997 ; GFX9-DL: ; %bb.0: ; %entry
1998 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1999 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2001 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2002 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2003 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2004 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2005 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
2006 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2007 ; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4
2008 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12
2009 ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 8
2010 ; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 4
2011 ; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12
2012 ; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 8
2013 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5
2014 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
2015 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s0
2016 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
2017 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
2018 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
2019 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s2
2020 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
2021 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2022 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2023 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2024 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2025 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2026 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2027 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2028 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2029 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v10
2030 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2031 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2032 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v7
2033 ; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20
2034 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16
2035 ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28
2036 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
2037 ; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 20
2038 ; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16
2039 ; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 28
2040 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
2041 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2042 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2043 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2044 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s2
2045 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5
2046 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s1
2047 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s0
2048 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s4
2049 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s8
2050 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s7
2051 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s6
2052 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2053 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2054 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2055 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2056 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2057 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2058 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2059 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2060 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
2061 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v12, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2062 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v11, v11, v15
2063 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2064 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v13
2065 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2066 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2067 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2068 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2069 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
2070 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
2071 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2072 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2073 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
2074 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4
2075 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
2076 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2077 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2078 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
2079 ; GFX9-DL-NEXT: s_endpgm
2080 <8 x i4> addrspace(1)* %src2,
2081 i8 addrspace(1)* nocapture %dst) {
2083 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2084 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2086 %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
2087 %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
2089 %mul = mul <8 x i8> %cvec1, %cvec2
2090 %mul0 = extractelement <8 x i8> %mul, i64 0
2091 %mul1 = extractelement <8 x i8> %mul, i64 1
2092 %mul2 = extractelement <8 x i8> %mul, i64 2
2093 %mul3 = extractelement <8 x i8> %mul, i64 3
2094 %mul4 = extractelement <8 x i8> %mul, i64 4
2095 %mul5 = extractelement <8 x i8> %mul, i64 5
2096 %mul6 = extractelement <8 x i8> %mul, i64 6
2097 %mul7 = extractelement <8 x i8> %mul, i64 7
2099 %acc = load i8, i8 addrspace(1)* %dst, align 4
2100 %add1 = add i8 %mul0, %acc
2101 %add2 = add i8 %add1, %mul1
2102 %add3 = add i8 %add2, %mul2
2103 %add4 = add i8 %add3, %mul3
2104 %add5 = add i8 %add4, %mul4
2105 %add6 = add i8 %add5, %mul5
2106 %add7 = add i8 %add6, %mul6
2107 %add8 = add i8 %add7, %mul7
2109 store i8 %add8, i8 addrspace(1)* %dst, align 4