1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; idot8_acc32: signed 8-lane dot product of two <8 x i4> vectors with a
; 32-bit accumulator.
;   %src1, %src2 - global (addrspace 1) pointers to the <8 x i4> operands.
;   %dst         - global pointer to the i32 accumulator; it is loaded, the
;                  eight lane products are added to it, and the sum is stored
;                  back to the same address.
; Expected codegen, per the autogenerated assertions below: the gfx700, gfx803
; and gfx900 runs expand to a chain of eight v_mad_i32_i24 instructions, while
; the gfx906, gfx1011 and gfx1012 runs select a single v_dot8_i32_i4.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
9 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
10 ; GFX7-LABEL: idot8_acc32:
11 ; GFX7: ; %bb.0: ; %entry
12 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
13 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
14 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
15 ; GFX7-NEXT: s_mov_b32 s6, -1
16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
18 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
19 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
20 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000
22 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000
23 ; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004
24 ; GFX7-NEXT: v_mov_b32_e32 v0, s9
25 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
26 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
27 ; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s11
29 ; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008
30 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
31 ; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008
32 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
33 ; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c
34 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
35 ; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c
36 ; GFX7-NEXT: v_mov_b32_e32 v1, s15
37 ; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010
38 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0
39 ; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010
40 ; GFX7-NEXT: v_mov_b32_e32 v1, s17
41 ; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014
42 ; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018
43 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0
44 ; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014
45 ; GFX7-NEXT: v_mov_b32_e32 v1, s19
46 ; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018
47 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
48 ; GFX7-NEXT: v_mov_b32_e32 v1, s21
49 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
50 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0
51 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28
52 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
53 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0
54 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
57 ; GFX8-LABEL: idot8_acc32:
58 ; GFX8: ; %bb.0: ; %entry
59 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
60 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
61 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
62 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
63 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
64 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
65 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
66 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
67 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
69 ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
70 ; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004
71 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
72 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
73 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
74 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
75 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
76 ; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008
77 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2
78 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
79 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
80 ; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c
81 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2
82 ; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
83 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
84 ; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010
85 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2
86 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
87 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
88 ; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014
89 ; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018
90 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2
91 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
92 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
93 ; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
94 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
95 ; GFX8-NEXT: v_mov_b32_e32 v3, s17
96 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28
97 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2
98 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28
99 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
100 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
101 ; GFX8-NEXT: flat_store_dword v[0:1], v2
102 ; GFX8-NEXT: s_endpgm
104 ; GFX9-LABEL: idot8_acc32:
105 ; GFX9: ; %bb.0: ; %entry
106 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
107 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
110 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
111 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
112 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
113 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
114 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
116 ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
117 ; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004
118 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
119 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
120 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3
121 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
122 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
123 ; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008
124 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2
125 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
126 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
127 ; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c
128 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2
129 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
130 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
131 ; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010
132 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2
133 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
134 ; GFX9-NEXT: v_mov_b32_e32 v3, s13
135 ; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014
136 ; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018
137 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2
138 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
139 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
140 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
141 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2
142 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
143 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28
144 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2
145 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28
146 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
147 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
148 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
149 ; GFX9-NEXT: s_endpgm
151 ; GFX9-DL-LABEL: idot8_acc32:
152 ; GFX9-DL: ; %bb.0: ; %entry
153 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
154 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
155 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
157 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
158 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
159 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
160 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
161 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
163 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
164 ; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3
165 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
166 ; GFX9-DL-NEXT: s_endpgm
168 ; GFX10-DL-LABEL: idot8_acc32:
169 ; GFX10-DL: ; %bb.0: ; %entry
170 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
171 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
172 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
173 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
175 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
176 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
177 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
178 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
179 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
181 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v2
182 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
183 ; GFX10-DL-NEXT: s_endpgm
184 <8 x i4> addrspace(1)* %src2,
185 i32 addrspace(1)* nocapture %dst) {
; Load both operands; each <8 x i4> value carries eight 4-bit lanes.
187 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
188 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Lanes 0-7: extract the matching i4 element from each vector, sign-extend
; both to i32 and multiply them.
; NOTE(review): every product carries both nuw and nsw. With sign-extended
; operands the unsigned product overflows for many negative inputs (for
; example -1 * -1), which makes the result poison under nuw - confirm the
; flag is intended before copying this pattern elsewhere.
190 %v1e0 = extractelement <8 x i4> %vec1, i64 0
191 %cv1e0 = sext i4 %v1e0 to i32
192 %v2e0 = extractelement <8 x i4> %vec2, i64 0
193 %cv2e0 = sext i4 %v2e0 to i32
194 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
196 %v1e1 = extractelement <8 x i4> %vec1, i64 1
197 %cv1e1 = sext i4 %v1e1 to i32
198 %v2e1 = extractelement <8 x i4> %vec2, i64 1
199 %cv2e1 = sext i4 %v2e1 to i32
200 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
202 %v1e2 = extractelement <8 x i4> %vec1, i64 2
203 %cv1e2 = sext i4 %v1e2 to i32
204 %v2e2 = extractelement <8 x i4> %vec2, i64 2
205 %cv2e2 = sext i4 %v2e2 to i32
206 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
208 %v1e3 = extractelement <8 x i4> %vec1, i64 3
209 %cv1e3 = sext i4 %v1e3 to i32
210 %v2e3 = extractelement <8 x i4> %vec2, i64 3
211 %cv2e3 = sext i4 %v2e3 to i32
212 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
214 %v1e4 = extractelement <8 x i4> %vec1, i64 4
215 %cv1e4 = sext i4 %v1e4 to i32
216 %v2e4 = extractelement <8 x i4> %vec2, i64 4
217 %cv2e4 = sext i4 %v2e4 to i32
218 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
220 %v1e5 = extractelement <8 x i4> %vec1, i64 5
221 %cv1e5 = sext i4 %v1e5 to i32
222 %v2e5 = extractelement <8 x i4> %vec2, i64 5
223 %cv2e5 = sext i4 %v2e5 to i32
224 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
226 %v1e6 = extractelement <8 x i4> %vec1, i64 6
227 %cv1e6 = sext i4 %v1e6 to i32
228 %v2e6 = extractelement <8 x i4> %vec2, i64 6
229 %cv2e6 = sext i4 %v2e6 to i32
230 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
232 %v1e7 = extractelement <8 x i4> %vec1, i64 7
233 %cv1e7 = sext i4 %v1e7 to i32
234 %v2e7 = extractelement <8 x i4> %vec2, i64 7
235 %cv2e7 = sext i4 %v2e7 to i32
236 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Read the current accumulator from %dst and add the eight products to it in
; lane order (mul0 first, mul7 last).
238 %acc = load i32, i32 addrspace(1)* %dst, align 4
239 %add1 = add i32 %mul0, %acc
240 %add2 = add i32 %add1, %mul1
241 %add3 = add i32 %add2, %mul2
242 %add4 = add i32 %add3, %mul3
243 %add5 = add i32 %add4, %mul4
244 %add6 = add i32 %add5, %mul5
245 %add7 = add i32 %add6, %mul6
246 %add8 = add i32 %add7, %mul7
; Write the updated accumulator back to %dst.
248 store i32 %add8, i32 addrspace(1)* %dst, align 4
252 ; TODO: Once the unnecessary zero extensions of the elements are removed,
253 ; the pattern recognizer will kick in.
; idot8_acc16: signed 8-lane dot product of two <8 x i4> vectors with a
; 16-bit accumulator.
;   %src1, %src2 - global (addrspace 1) pointers to the <8 x i4> operands.
;   %dst         - global pointer to the i16 accumulator; it is loaded, the
;                  eight lane products are added to it, and the sum is stored
;                  back to the same address.
; Expected codegen, per the autogenerated assertions below: every -mcpu in the
; RUN lines expands to an explicit multiply/add sequence; no v_dot8 instruction
; appears for any target in this function.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
254 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
255 ; GFX7-LABEL: idot8_acc16:
256 ; GFX7: ; %bb.0: ; %entry
257 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
258 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
259 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
260 ; GFX7-NEXT: s_mov_b32 s6, -1
261 ; GFX7-NEXT: s_mov_b32 s0, 0xffff
262 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
264 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
265 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0
266 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000
268 ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004
269 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
270 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004
271 ; GFX7-NEXT: s_and_b32 s9, s9, s0
272 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008
273 ; GFX7-NEXT: s_and_b32 s11, s11, s0
274 ; GFX7-NEXT: s_and_b32 s8, s8, s0
275 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
276 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008
277 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c
278 ; GFX7-NEXT: s_and_b32 s13, s13, s0
279 ; GFX7-NEXT: s_and_b32 s10, s10, s0
280 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
281 ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c
282 ; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010
283 ; GFX7-NEXT: s_and_b32 s15, s15, s0
284 ; GFX7-NEXT: s_and_b32 s12, s12, s0
285 ; GFX7-NEXT: v_mov_b32_e32 v3, s13
286 ; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010
287 ; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014
288 ; GFX7-NEXT: s_and_b32 s17, s17, s0
289 ; GFX7-NEXT: s_and_b32 s14, s14, s0
290 ; GFX7-NEXT: v_mov_b32_e32 v4, s15
291 ; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018
292 ; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014
293 ; GFX7-NEXT: s_and_b32 s19, s19, s0
294 ; GFX7-NEXT: s_and_b32 s16, s16, s0
295 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
296 ; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018
297 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
298 ; GFX7-NEXT: s_and_b32 s21, s21, s0
299 ; GFX7-NEXT: s_and_b32 s18, s18, s0
300 ; GFX7-NEXT: v_mov_b32_e32 v6, s19
301 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
302 ; GFX7-NEXT: s_and_b32 s20, s20, s0
303 ; GFX7-NEXT: s_and_b32 s2, s2, s0
304 ; GFX7-NEXT: v_mov_b32_e32 v7, s21
305 ; GFX7-NEXT: s_and_b32 s0, s1, s0
306 ; GFX7-NEXT: s_waitcnt vmcnt(0)
307 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
308 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0
309 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0
310 ; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0
311 ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0
312 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0
313 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0
314 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
315 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0
316 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
317 ; GFX7-NEXT: s_endpgm
319 ; GFX8-LABEL: idot8_acc16:
320 ; GFX8: ; %bb.0: ; %entry
321 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
322 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
323 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
324 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
325 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
326 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
327 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
328 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
329 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
331 ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
332 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
333 ; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004
334 ; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008
335 ; GFX8-NEXT: s_lshr_b32 s1, s2, 12
336 ; GFX8-NEXT: s_lshr_b32 s7, s4, 12
337 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004
338 ; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008
339 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
340 ; GFX8-NEXT: v_mov_b32_e32 v7, s5
341 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1
342 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7
343 ; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4
344 ; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010
345 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
346 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
347 ; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014
348 ; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010
349 ; GFX8-NEXT: v_mov_b32_e32 v8, s10
350 ; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018
351 ; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014
352 ; GFX8-NEXT: v_mov_b32_e32 v9, s12
353 ; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018
354 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28
355 ; GFX8-NEXT: v_mov_b32_e32 v10, s14
356 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28
357 ; GFX8-NEXT: s_waitcnt vmcnt(0)
358 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
359 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
360 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
361 ; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
362 ; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
363 ; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
364 ; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
365 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
366 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
367 ; GFX8-NEXT: flat_store_short v[0:1], v2
368 ; GFX8-NEXT: s_endpgm
370 ; GFX9-LABEL: idot8_acc16:
371 ; GFX9: ; %bb.0: ; %entry
372 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
373 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
374 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
375 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
376 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
377 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
378 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
379 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
380 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
382 ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
383 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
384 ; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004
385 ; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008
386 ; GFX9-NEXT: s_lshr_b32 s1, s2, 12
387 ; GFX9-NEXT: s_lshr_b32 s7, s4, 12
388 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
389 ; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008
390 ; GFX9-NEXT: v_mov_b32_e32 v4, s6
391 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
392 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1
393 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7
394 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4
395 ; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010
396 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
397 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
398 ; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014
399 ; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010
400 ; GFX9-NEXT: v_mov_b32_e32 v8, s10
401 ; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018
402 ; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014
403 ; GFX9-NEXT: v_mov_b32_e32 v9, s12
404 ; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018
405 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28
406 ; GFX9-NEXT: v_mov_b32_e32 v10, s14
407 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28
408 ; GFX9-NEXT: s_waitcnt vmcnt(0)
409 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
410 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
411 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
412 ; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2
413 ; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
414 ; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
415 ; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
416 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
417 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
418 ; GFX9-NEXT: global_store_short v[0:1], v2, off
419 ; GFX9-NEXT: s_endpgm
421 ; GFX9-DL-LABEL: idot8_acc16:
422 ; GFX9-DL: ; %bb.0: ; %entry
423 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
424 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
425 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
427 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
428 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
429 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
430 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
431 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
432 ; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
433 ; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
434 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
435 ; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004
436 ; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008
437 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12
438 ; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12
439 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
440 ; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008
441 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
442 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5
443 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
444 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7
445 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4
446 ; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
447 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
448 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
449 ; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014
450 ; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010
451 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10
452 ; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018
453 ; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014
454 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12
455 ; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018
456 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
457 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14
458 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
459 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
460 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
461 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
462 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
463 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2
464 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
465 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
466 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
467 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
468 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
469 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
470 ; GFX9-DL-NEXT: s_endpgm
472 ; GFX10-DL-LABEL: idot8_acc16:
473 ; GFX10-DL: ; %bb.0: ; %entry
474 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
475 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
476 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
477 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
478 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
480 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
481 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
482 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
483 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
484 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12
486 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12
487 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000
488 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
489 ; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004
490 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
491 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
492 ; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40004
493 ; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40008
494 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008
495 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
496 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
497 ; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010
498 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
499 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8
500 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
501 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
502 ; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014
503 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014
504 ; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018
505 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
506 ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
507 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
508 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28
509 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
510 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
511 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3
512 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3
513 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
514 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
515 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2
516 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2
517 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2
518 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
519 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
520 ; GFX10-DL-NEXT: s_endpgm
521 <8 x i4> addrspace(1)* %src2,
522 i16 addrspace(1)* nocapture %dst) {
; Load both operands; each <8 x i4> value carries eight 4-bit lanes.
524 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
525 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Lanes 0-7: extract the matching i4 element from each vector, sign-extend
; both to i16 and multiply them.
; NOTE(review): every product carries both nuw and nsw. With sign-extended
; operands the unsigned product overflows for many negative inputs (for
; example -1 * -1), which makes the result poison under nuw - confirm the
; flag is intended before copying this pattern elsewhere.
527 %v1e0 = extractelement <8 x i4> %vec1, i64 0
528 %cv1e0 = sext i4 %v1e0 to i16
529 %v2e0 = extractelement <8 x i4> %vec2, i64 0
530 %cv2e0 = sext i4 %v2e0 to i16
531 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
533 %v1e1 = extractelement <8 x i4> %vec1, i64 1
534 %cv1e1 = sext i4 %v1e1 to i16
535 %v2e1 = extractelement <8 x i4> %vec2, i64 1
536 %cv2e1 = sext i4 %v2e1 to i16
537 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
539 %v1e2 = extractelement <8 x i4> %vec1, i64 2
540 %cv1e2 = sext i4 %v1e2 to i16
541 %v2e2 = extractelement <8 x i4> %vec2, i64 2
542 %cv2e2 = sext i4 %v2e2 to i16
543 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
545 %v1e3 = extractelement <8 x i4> %vec1, i64 3
546 %cv1e3 = sext i4 %v1e3 to i16
547 %v2e3 = extractelement <8 x i4> %vec2, i64 3
548 %cv2e3 = sext i4 %v2e3 to i16
549 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
551 %v1e4 = extractelement <8 x i4> %vec1, i64 4
552 %cv1e4 = sext i4 %v1e4 to i16
553 %v2e4 = extractelement <8 x i4> %vec2, i64 4
554 %cv2e4 = sext i4 %v2e4 to i16
555 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
557 %v1e5 = extractelement <8 x i4> %vec1, i64 5
558 %cv1e5 = sext i4 %v1e5 to i16
559 %v2e5 = extractelement <8 x i4> %vec2, i64 5
560 %cv2e5 = sext i4 %v2e5 to i16
561 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
563 %v1e6 = extractelement <8 x i4> %vec1, i64 6
564 %cv1e6 = sext i4 %v1e6 to i16
565 %v2e6 = extractelement <8 x i4> %vec2, i64 6
566 %cv2e6 = sext i4 %v2e6 to i16
567 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
569 %v1e7 = extractelement <8 x i4> %vec1, i64 7
570 %cv1e7 = sext i4 %v1e7 to i16
571 %v2e7 = extractelement <8 x i4> %vec2, i64 7
572 %cv2e7 = sext i4 %v2e7 to i16
573 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
; Read the current i16 accumulator from %dst and add the eight products to it
; in lane order (mul0 first, mul7 last).
575 %acc = load i16, i16 addrspace(1)* %dst, align 4
576 %add1 = add i16 %mul0, %acc
577 %add2 = add i16 %add1, %mul1
578 %add3 = add i16 %add2, %mul2
579 %add4 = add i16 %add3, %mul3
580 %add5 = add i16 %add4, %mul4
581 %add6 = add i16 %add5, %mul5
582 %add7 = add i16 %add6, %mul6
583 %add8 = add i16 %add7, %mul7
; Write the updated accumulator back to %dst.
585 store i16 %add8, i16 addrspace(1)* %dst, align 4
589 ; TODO: Support this pattern.
590 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
591 ; GFX7-LABEL: idot8_acc8:
592 ; GFX7: ; %bb.0: ; %entry
593 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
594 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
595 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
596 ; GFX7-NEXT: s_mov_b32 s6, -1
597 ; GFX7-NEXT: s_movk_i32 s0, 0xff
598 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
600 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
601 ; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0
602 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
603 ; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000
604 ; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004
605 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
606 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004
607 ; GFX7-NEXT: s_and_b32 s9, s9, s0
608 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008
609 ; GFX7-NEXT: s_and_b32 s11, s11, s0
610 ; GFX7-NEXT: s_and_b32 s8, s8, s0
611 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
612 ; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008
613 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c
614 ; GFX7-NEXT: s_and_b32 s13, s13, s0
615 ; GFX7-NEXT: s_and_b32 s10, s10, s0
616 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
617 ; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c
618 ; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010
619 ; GFX7-NEXT: s_and_b32 s15, s15, s0
620 ; GFX7-NEXT: s_and_b32 s12, s12, s0
621 ; GFX7-NEXT: v_mov_b32_e32 v3, s13
622 ; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010
623 ; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014
624 ; GFX7-NEXT: s_and_b32 s17, s17, s0
625 ; GFX7-NEXT: s_and_b32 s14, s14, s0
626 ; GFX7-NEXT: v_mov_b32_e32 v4, s15
627 ; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018
628 ; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014
629 ; GFX7-NEXT: s_and_b32 s19, s19, s0
630 ; GFX7-NEXT: s_and_b32 s16, s16, s0
631 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
632 ; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018
633 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
634 ; GFX7-NEXT: s_and_b32 s21, s21, s0
635 ; GFX7-NEXT: s_and_b32 s18, s18, s0
636 ; GFX7-NEXT: v_mov_b32_e32 v6, s19
637 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
638 ; GFX7-NEXT: s_and_b32 s20, s20, s0
639 ; GFX7-NEXT: s_and_b32 s2, s2, s0
640 ; GFX7-NEXT: v_mov_b32_e32 v7, s21
641 ; GFX7-NEXT: s_and_b32 s0, s1, s0
642 ; GFX7-NEXT: s_waitcnt vmcnt(0)
643 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
644 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0
645 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0
646 ; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0
647 ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0
648 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0
649 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0
650 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
651 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0
652 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
653 ; GFX7-NEXT: s_endpgm
655 ; GFX8-LABEL: idot8_acc8:
656 ; GFX8: ; %bb.0: ; %entry
657 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
658 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
659 ; GFX8-NEXT: s_movk_i32 s2, 0xff
660 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
661 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
662 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
663 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
664 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
665 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
666 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX8-NEXT: s_lshr_b32 s4, s0, 12
668 ; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000
669 ; GFX8-NEXT: s_lshr_b32 s5, s1, 12
670 ; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004
671 ; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008
672 ; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000
673 ; GFX8-NEXT: v_mov_b32_e32 v6, s7
674 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4
675 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5
676 ; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004
677 ; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008
678 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
679 ; GFX8-NEXT: v_mov_b32_e32 v7, s9
680 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
681 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
682 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3
683 ; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010
684 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
685 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5
686 ; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014
687 ; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010
688 ; GFX8-NEXT: v_mov_b32_e32 v8, s13
689 ; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018
690 ; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014
691 ; GFX8-NEXT: v_mov_b32_e32 v9, s15
692 ; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018
693 ; GFX8-NEXT: s_ashr_i32 s1, s1, 28
694 ; GFX8-NEXT: v_mov_b32_e32 v10, s17
695 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28
696 ; GFX8-NEXT: s_waitcnt vmcnt(0)
697 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2
698 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
699 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
700 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
701 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2
702 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2
703 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2
704 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
705 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
706 ; GFX8-NEXT: flat_store_byte v[0:1], v2
707 ; GFX8-NEXT: s_endpgm
709 ; GFX9-LABEL: idot8_acc8:
710 ; GFX9: ; %bb.0: ; %entry
711 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
712 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
713 ; GFX9-NEXT: s_movk_i32 s2, 0xff
714 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
715 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
716 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
717 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
718 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
719 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
720 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
721 ; GFX9-NEXT: s_lshr_b32 s4, s0, 12
722 ; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000
723 ; GFX9-NEXT: s_lshr_b32 s5, s1, 12
724 ; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004
725 ; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008
726 ; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000
727 ; GFX9-NEXT: v_mov_b32_e32 v6, s7
728 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4
729 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5
730 ; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004
731 ; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008
732 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
733 ; GFX9-NEXT: v_mov_b32_e32 v7, s9
734 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
735 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
736 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3
737 ; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010
738 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
739 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
740 ; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014
741 ; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010
742 ; GFX9-NEXT: v_mov_b32_e32 v8, s13
743 ; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018
744 ; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014
745 ; GFX9-NEXT: v_mov_b32_e32 v9, s15
746 ; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018
747 ; GFX9-NEXT: s_ashr_i32 s1, s1, 28
748 ; GFX9-NEXT: v_mov_b32_e32 v10, s17
749 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28
750 ; GFX9-NEXT: s_waitcnt vmcnt(0)
751 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2
752 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
753 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
754 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
755 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2
756 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2
757 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2
758 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
759 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
760 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
761 ; GFX9-NEXT: s_endpgm
763 ; GFX9-DL-LABEL: idot8_acc8:
764 ; GFX9-DL: ; %bb.0: ; %entry
765 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
766 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
767 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
768 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
769 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
770 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
771 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
772 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
773 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
774 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
775 ; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12
776 ; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
777 ; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12
778 ; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
779 ; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
780 ; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
781 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
782 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
783 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5
784 ; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
785 ; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
786 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
787 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
788 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
789 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
790 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3
791 ; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010
792 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
793 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
794 ; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014
795 ; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010
796 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13
797 ; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018
798 ; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014
799 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15
800 ; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018
801 ; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
802 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17
803 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
804 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
805 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2
806 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
807 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
808 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
809 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2
810 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2
811 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2
812 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
813 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
814 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
815 ; GFX9-DL-NEXT: s_endpgm
817 ; GFX10-DL-LABEL: idot8_acc8:
818 ; GFX10-DL: ; %bb.0: ; %entry
819 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
820 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
821 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
822 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
823 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
824 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
825 ; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
826 ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
827 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
828 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
829 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
830 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
831 ; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12
832 ; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12
833 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
834 ; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000
835 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004
836 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
837 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
838 ; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004
839 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40008
840 ; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40008
841 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
842 ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
843 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
844 ; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010
845 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9
846 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
847 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
848 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014
849 ; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014
850 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
851 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
852 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
853 ; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018
854 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
855 ; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28
856 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
857 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3
858 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3
859 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
860 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
861 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2
862 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2
863 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2
864 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
865 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
866 ; GFX10-DL-NEXT: s_endpgm
867 <8 x i4> addrspace(1)* %src2,
868 i8 addrspace(1)* nocapture %dst) {
870 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
871 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
873 %v1e0 = extractelement <8 x i4> %vec1, i64 0
874 %cv1e0 = sext i4 %v1e0 to i8
875 %v2e0 = extractelement <8 x i4> %vec2, i64 0
876 %cv2e0 = sext i4 %v2e0 to i8
877 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
879 %v1e1 = extractelement <8 x i4> %vec1, i64 1
880 %cv1e1 = sext i4 %v1e1 to i8
881 %v2e1 = extractelement <8 x i4> %vec2, i64 1
882 %cv2e1 = sext i4 %v2e1 to i8
883 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
885 %v1e2 = extractelement <8 x i4> %vec1, i64 2
886 %cv1e2 = sext i4 %v1e2 to i8
887 %v2e2 = extractelement <8 x i4> %vec2, i64 2
888 %cv2e2 = sext i4 %v2e2 to i8
889 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
891 %v1e3 = extractelement <8 x i4> %vec1, i64 3
892 %cv1e3 = sext i4 %v1e3 to i8
893 %v2e3 = extractelement <8 x i4> %vec2, i64 3
894 %cv2e3 = sext i4 %v2e3 to i8
895 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
897 %v1e4 = extractelement <8 x i4> %vec1, i64 4
898 %cv1e4 = sext i4 %v1e4 to i8
899 %v2e4 = extractelement <8 x i4> %vec2, i64 4
900 %cv2e4 = sext i4 %v2e4 to i8
901 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
903 %v1e5 = extractelement <8 x i4> %vec1, i64 5
904 %cv1e5 = sext i4 %v1e5 to i8
905 %v2e5 = extractelement <8 x i4> %vec2, i64 5
906 %cv2e5 = sext i4 %v2e5 to i8
907 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
909 %v1e6 = extractelement <8 x i4> %vec1, i64 6
910 %cv1e6 = sext i4 %v1e6 to i8
911 %v2e6 = extractelement <8 x i4> %vec2, i64 6
912 %cv2e6 = sext i4 %v2e6 to i8
913 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
915 %v1e7 = extractelement <8 x i4> %vec1, i64 7
916 %cv1e7 = sext i4 %v1e7 to i8
917 %v2e7 = extractelement <8 x i4> %vec2, i64 7
918 %cv2e7 = sext i4 %v2e7 to i8
919 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
921 %acc = load i8, i8 addrspace(1)* %dst, align 4
922 %add1 = add i8 %mul0, %acc
923 %add2 = add i8 %add1, %mul1
924 %add3 = add i8 %add2, %mul2
925 %add4 = add i8 %add3, %mul3
926 %add5 = add i8 %add4, %mul4
927 %add6 = add i8 %add5, %mul5
928 %add7 = add i8 %add6, %mul6
929 %add8 = add i8 %add7, %mul7
931 store i8 %add8, i8 addrspace(1)* %dst, align 4
935 ; Make sure the dot-product pattern is not recognized when an intermediate
936 ; multiplication has more than one use. In the IR below %mul0 feeds both %add
; and %add1, and %add is consumed a second time by the final %res. The
; autogenerated assertions for all checked prefixes therefore expect a chain of
; v_mad_i32_i24 instructions followed by a separate vector add, with the first
; multiply-add issued twice.
937 define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
938 ; GFX7-LABEL: idot8_multiuses_mul1:
939 ; GFX7: ; %bb.0: ; %entry
940 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
941 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
942 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
943 ; GFX7-NEXT: s_mov_b32 s6, -1
944 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
945 ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
946 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
947 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
948 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
949 ; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000
950 ; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000
951 ; GFX7-NEXT: v_mov_b32_e32 v0, s9
952 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
953 ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
954 ; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004
955 ; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004
956 ; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008
957 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
958 ; GFX7-NEXT: v_mov_b32_e32 v2, s11
959 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0
960 ; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008
961 ; GFX7-NEXT: v_mov_b32_e32 v2, s13
962 ; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c
963 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v2, v0
964 ; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c
965 ; GFX7-NEXT: v_mov_b32_e32 v2, s15
966 ; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010
967 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0
968 ; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010
969 ; GFX7-NEXT: v_mov_b32_e32 v2, s17
970 ; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014
971 ; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018
972 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0
973 ; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014
974 ; GFX7-NEXT: v_mov_b32_e32 v2, s19
975 ; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018
976 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0
977 ; GFX7-NEXT: v_mov_b32_e32 v2, s21
978 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28
979 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v2, v0
980 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28
981 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
982 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v2, v0
983 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
984 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
985 ; GFX7-NEXT: s_endpgm
987 ; GFX8-LABEL: idot8_multiuses_mul1:
988 ; GFX8: ; %bb.0: ; %entry
989 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
990 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
991 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
992 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
993 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
994 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
995 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
996 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
997 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
998 ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
999 ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
1000 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1001 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
1002 ; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1003 ; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004
1004 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004
1005 ; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008
1006 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1007 ; GFX8-NEXT: v_mov_b32_e32 v4, s7
1008 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v4, v2
1009 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008
1010 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
1011 ; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c
1012 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2
1013 ; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c
1014 ; GFX8-NEXT: v_mov_b32_e32 v4, s11
1015 ; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010
1016 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v4, v2
1017 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
1018 ; GFX8-NEXT: v_mov_b32_e32 v4, s13
1019 ; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014
1020 ; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018
1021 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v4, v2
1022 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
1023 ; GFX8-NEXT: v_mov_b32_e32 v4, s15
1024 ; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
1025 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2
1026 ; GFX8-NEXT: v_mov_b32_e32 v4, s17
1027 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28
1028 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2
1029 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28
1030 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
1031 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2
1032 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1033 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1034 ; GFX8-NEXT: s_endpgm
1036 ; GFX9-LABEL: idot8_multiuses_mul1:
1037 ; GFX9: ; %bb.0: ; %entry
1038 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1039 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1040 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
1042 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1043 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
1044 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1045 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1046 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1047 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
1048 ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
1049 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1050 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
1051 ; GFX9-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1052 ; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004
1053 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
1054 ; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008
1055 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1056 ; GFX9-NEXT: v_mov_b32_e32 v4, s7
1057 ; GFX9-NEXT: v_mad_i32_i24 v2, s6, v4, v2
1058 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
1059 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1060 ; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c
1061 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v4, v2
1062 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
1063 ; GFX9-NEXT: v_mov_b32_e32 v4, s11
1064 ; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010
1065 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v4, v2
1066 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
1067 ; GFX9-NEXT: v_mov_b32_e32 v4, s13
1068 ; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014
1069 ; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018
1070 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v4, v2
1071 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
1072 ; GFX9-NEXT: v_mov_b32_e32 v4, s15
1073 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
1074 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v4, v2
1075 ; GFX9-NEXT: v_mov_b32_e32 v4, s17
1076 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28
1077 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v4, v2
1078 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28
1079 ; GFX9-NEXT: v_mov_b32_e32 v4, s4
1080 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v4, v2
1081 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1082 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1083 ; GFX9-NEXT: s_endpgm
1085 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
1086 ; GFX9-DL: ; %bb.0: ; %entry
1087 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1088 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1089 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1090 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1091 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1092 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1093 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1094 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1095 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
1097 ; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
1098 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1
1099 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
1100 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1101 ; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40004
1102 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004
1103 ; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40008
1104 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1105 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7
1106 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v4, v2
1107 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008
1108 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9
1109 ; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x4000c
1110 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v4, v2
1111 ; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c
1112 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
1113 ; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x40010
1114 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v4, v2
1115 ; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
1116 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13
1117 ; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40014
1118 ; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40018
1119 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v4, v2
1120 ; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
1121 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
1122 ; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
1123 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v4, v2
1124 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17
1125 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
1126 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v4, v2
1127 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
1128 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
1129 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
1130 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1131 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1132 ; GFX9-DL-NEXT: s_endpgm
1134 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
1135 ; GFX10-DL: ; %bb.0: ; %entry
1136 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1137 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1138 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1139 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1140 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1141 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1142 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1143 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1144 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1145 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1146 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
1147 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
1148 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1149 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004
1150 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004
1151 ; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40008
1152 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008
1153 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
1154 ; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x4000c
1155 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x4000c
1156 ; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40010
1157 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40010
1158 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v2
1159 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40014
1160 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014
1161 ; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018
1162 ; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018
1163 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3
1164 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s8, v3
1165 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s9, s10, v3
1166 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s11, s12, v3
1167 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3
1168 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28
1169 ; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28
1170 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s13, s14, v3
1171 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3
1172 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
1173 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1174 ; GFX10-DL-NEXT: s_endpgm
1175 <8 x i4> addrspace(1)* %src2,
1176 i32 addrspace(1)* nocapture %dst) {
; Load the two <8 x i4> operands from %src1 and %src2.
1178 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1179 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; For each lane 0..7: extract the i4 element of both vectors, sign-extend each
; to i32 and multiply, producing the scalar products %mul0..%mul7.
; NOTE(review): every mul carries nuw although its operands come from sext; per
; the LLVM LangRef a nuw multiply that wraps as unsigned is poison (for example
; -1 * 2), so confirm the flag is intentional for these signed tests.
1181 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1182 %cv1e0 = sext i4 %v1e0 to i32
1183 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1184 %cv2e0 = sext i4 %v2e0 to i32
1185 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1187 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1188 %cv1e1 = sext i4 %v1e1 to i32
1189 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1190 %cv2e1 = sext i4 %v2e1 to i32
1191 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1193 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1194 %cv1e2 = sext i4 %v1e2 to i32
1195 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1196 %cv2e2 = sext i4 %v2e2 to i32
1197 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1199 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1200 %cv1e3 = sext i4 %v1e3 to i32
1201 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1202 %cv2e3 = sext i4 %v2e3 to i32
1203 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1205 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1206 %cv1e4 = sext i4 %v1e4 to i32
1207 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1208 %cv2e4 = sext i4 %v2e4 to i32
1209 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1211 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1212 %cv1e5 = sext i4 %v1e5 to i32
1213 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1214 %cv2e5 = sext i4 %v2e5 to i32
1215 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1217 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1218 %cv1e6 = sext i4 %v1e6 to i32
1219 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1220 %cv2e6 = sext i4 %v2e6 to i32
1221 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1223 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1224 %cv1e7 = sext i4 %v1e7 to i32
1225 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1226 %cv2e7 = sext i4 %v2e7 to i32
1227 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Accumulate starting from the i32 currently stored at %dst. %mul0 is added
; twice (once into %add, once into %add1); this second use is what the test
; relies on to keep the chain from being treated as a plain dot product.
1229 %acc = load i32, i32 addrspace(1)* %dst, align 4
1230 %add = add i32 %mul0, %acc
1231 %add1 = add i32 %mul0, %add
1232 %add2 = add i32 %add1, %mul1
1233 %add3 = add i32 %add2, %mul2
1234 %add4 = add i32 %add3, %mul3
1235 %add5 = add i32 %add4, %mul4
1236 %add6 = add i32 %add5, %mul5
1237 %add7 = add i32 %add6, %mul6
1238 %add8 = add i32 %add7, %mul7
; %add is reused here as well, so the stored result is %add + %add8.
1240 %res = add i32 %add, %add8
1241 store i32 %res, i32 addrspace(1)* %dst, align 4
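1245 ; TODO: Support this pattern, where both operands are sign-extended as whole
; vectors, multiplied as <8 x i32>, and the products extracted afterwards.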
1246 define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1247 ; GFX7-LABEL: idot8_acc32_vecMul:
1248 ; GFX7: ; %bb.0: ; %entry
1249 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1250 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1251 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1252 ; GFX7-NEXT: s_mov_b32 s6, -1
1253 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1254 ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0
1255 ; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0
1256 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
1257 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1258 ; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
1259 ; GFX7-NEXT: s_lshl_b32 s11, s1, 4
1260 ; GFX7-NEXT: s_ashr_i64 s[14:15], s[10:11], 60
1261 ; GFX7-NEXT: s_lshl_b32 s11, s1, 12
1262 ; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60
1263 ; GFX7-NEXT: s_lshl_b32 s11, s1, 16
1264 ; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60
1265 ; GFX7-NEXT: s_lshl_b32 s11, s1, 20
1266 ; GFX7-NEXT: s_lshl_b32 s13, s1, 8
1267 ; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60
1268 ; GFX7-NEXT: s_lshl_b32 s11, s1, 24
1269 ; GFX7-NEXT: s_lshl_b32 s1, s1, 28
1270 ; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1271 ; GFX7-NEXT: s_lshl_b32 s1, s9, 4
1272 ; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1273 ; GFX7-NEXT: s_lshl_b32 s1, s9, 8
1274 ; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1275 ; GFX7-NEXT: s_lshl_b32 s1, s9, 12
1276 ; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1277 ; GFX7-NEXT: s_lshl_b32 s1, s9, 16
1278 ; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1279 ; GFX7-NEXT: s_lshl_b32 s1, s9, 20
1280 ; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60
1281 ; GFX7-NEXT: s_lshl_b32 s1, s9, 24
1282 ; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60
1283 ; GFX7-NEXT: s_lshl_b32 s1, s9, 28
1284 ; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60
1285 ; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60
1286 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
1287 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
1288 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1
1289 ; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60
1290 ; GFX7-NEXT: v_mov_b32_e32 v1, s36
1291 ; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0
1292 ; GFX7-NEXT: v_mov_b32_e32 v1, s34
1293 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0
1294 ; GFX7-NEXT: v_mov_b32_e32 v1, s32
1295 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
1296 ; GFX7-NEXT: v_mov_b32_e32 v1, s30
1297 ; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0
1298 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
1299 ; GFX7-NEXT: v_mov_b32_e32 v1, s28
1300 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0
1301 ; GFX7-NEXT: v_mov_b32_e32 v1, s26
1302 ; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0
1303 ; GFX7-NEXT: v_mov_b32_e32 v1, s24
1304 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0
1305 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1306 ; GFX7-NEXT: s_endpgm
1308 ; GFX8-LABEL: idot8_acc32_vecMul:
1309 ; GFX8: ; %bb.0: ; %entry
1310 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1311 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1312 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1313 ; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0
1314 ; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0
1315 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1316 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1317 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1318 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1319 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1320 ; GFX8-NEXT: s_lshl_b32 s1, s5, 4
1321 ; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1322 ; GFX8-NEXT: s_lshl_b32 s1, s5, 16
1323 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1324 ; GFX8-NEXT: s_lshl_b32 s1, s5, 20
1325 ; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1326 ; GFX8-NEXT: s_lshl_b32 s1, s5, 24
1327 ; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1328 ; GFX8-NEXT: s_lshl_b32 s1, s5, 28
1329 ; GFX8-NEXT: s_lshl_b32 s9, s5, 8
1330 ; GFX8-NEXT: s_lshl_b32 s11, s5, 12
1331 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1332 ; GFX8-NEXT: s_lshl_b32 s1, s7, 4
1333 ; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1334 ; GFX8-NEXT: s_lshl_b32 s1, s7, 8
1335 ; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1336 ; GFX8-NEXT: s_lshl_b32 s1, s7, 12
1337 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1338 ; GFX8-NEXT: s_lshl_b32 s1, s7, 16
1339 ; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1340 ; GFX8-NEXT: s_lshl_b32 s1, s7, 20
1341 ; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1342 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24
1343 ; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1344 ; GFX8-NEXT: s_lshl_b32 s1, s7, 28
1345 ; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1346 ; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1347 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
1348 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
1349 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1350 ; GFX8-NEXT: v_mov_b32_e32 v3, s32
1351 ; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1352 ; GFX8-NEXT: v_mov_b32_e32 v3, s30
1353 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1354 ; GFX8-NEXT: v_mov_b32_e32 v3, s28
1355 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1356 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1357 ; GFX8-NEXT: v_mov_b32_e32 v3, s26
1358 ; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1359 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1360 ; GFX8-NEXT: v_mov_b32_e32 v3, s24
1361 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1362 ; GFX8-NEXT: v_mov_b32_e32 v3, s22
1363 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1364 ; GFX8-NEXT: v_mov_b32_e32 v3, s20
1365 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1366 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1367 ; GFX8-NEXT: s_endpgm
1369 ; GFX9-LABEL: idot8_acc32_vecMul:
1370 ; GFX9: ; %bb.0: ; %entry
1371 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1372 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1374 ; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0
1375 ; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0
1376 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
1377 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1378 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1379 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1380 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1381 ; GFX9-NEXT: s_lshl_b32 s1, s5, 4
1382 ; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1383 ; GFX9-NEXT: s_lshl_b32 s1, s5, 16
1384 ; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1385 ; GFX9-NEXT: s_lshl_b32 s1, s5, 20
1386 ; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1387 ; GFX9-NEXT: s_lshl_b32 s1, s5, 24
1388 ; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1389 ; GFX9-NEXT: s_lshl_b32 s1, s5, 28
1390 ; GFX9-NEXT: s_lshl_b32 s9, s5, 8
1391 ; GFX9-NEXT: s_lshl_b32 s11, s5, 12
1392 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1393 ; GFX9-NEXT: s_lshl_b32 s1, s7, 4
1394 ; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1395 ; GFX9-NEXT: s_lshl_b32 s1, s7, 8
1396 ; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1397 ; GFX9-NEXT: s_lshl_b32 s1, s7, 12
1398 ; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1399 ; GFX9-NEXT: s_lshl_b32 s1, s7, 16
1400 ; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1401 ; GFX9-NEXT: s_lshl_b32 s1, s7, 20
1402 ; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1403 ; GFX9-NEXT: s_lshl_b32 s1, s7, 24
1404 ; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1405 ; GFX9-NEXT: s_lshl_b32 s1, s7, 28
1406 ; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1407 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1408 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1409 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1410 ; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1411 ; GFX9-NEXT: v_mov_b32_e32 v3, s32
1412 ; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1413 ; GFX9-NEXT: v_mov_b32_e32 v3, s30
1414 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1415 ; GFX9-NEXT: v_mov_b32_e32 v3, s28
1416 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1417 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1418 ; GFX9-NEXT: v_mov_b32_e32 v3, s26
1419 ; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1420 ; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1421 ; GFX9-NEXT: v_mov_b32_e32 v3, s24
1422 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1423 ; GFX9-NEXT: v_mov_b32_e32 v3, s22
1424 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1425 ; GFX9-NEXT: v_mov_b32_e32 v3, s20
1426 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1427 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1428 ; GFX9-NEXT: s_endpgm
1430 ; GFX9-DL-LABEL: idot8_acc32_vecMul:
1431 ; GFX9-DL: ; %bb.0: ; %entry
1432 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1433 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1434 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1435 ; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0
1436 ; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0
1437 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1438 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1439 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1440 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1442 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4
1443 ; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1444 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16
1445 ; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60
1446 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20
1447 ; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60
1448 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24
1449 ; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60
1450 ; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28
1451 ; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8
1452 ; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12
1453 ; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
1454 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4
1455 ; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60
1456 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8
1457 ; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60
1458 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12
1459 ; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60
1460 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16
1461 ; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60
1462 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20
1463 ; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60
1464 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24
1465 ; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60
1466 ; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28
1467 ; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60
1468 ; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60
1469 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
1470 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
1471 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3
1472 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32
1473 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2
1474 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30
1475 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2
1476 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28
1477 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2
1478 ; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1479 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26
1480 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2
1481 ; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1482 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24
1483 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2
1484 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22
1485 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2
1486 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20
1487 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
1488 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1489 ; GFX9-DL-NEXT: s_endpgm
1491 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
1492 ; GFX10-DL: ; %bb.0: ; %entry
1493 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1494 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1495 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1496 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1497 ; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0
1498 ; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0
1499 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1500 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1501 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1502 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1503 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 28
1504 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 28
1505 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 24
1506 ; GFX10-DL-NEXT: s_lshl_b32 s13, s7, 24
1507 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
1508 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1509 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1510 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 20
1511 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1512 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 20
1513 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
1514 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2
1515 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1516 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 16
1517 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1518 ; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 16
1519 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2
1520 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12
1521 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1522 ; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12
1523 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1524 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2
1525 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 8
1526 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1527 ; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[10:11], 60
1528 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 8
1529 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2
1530 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1531 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 4
1532 ; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 4
1533 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[8:9], 60
1534 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s14, v2
1535 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[10:11], 60
1536 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
1537 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s12, v2
1538 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1539 ; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[6:7], 60
1540 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s10, v2
1541 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s4, v2
1542 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1543 ; GFX10-DL-NEXT: s_endpgm
1544 <8 x i4> addrspace(1)* %src2,
1545 i32 addrspace(1)* nocapture %dst) {
1547 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1548 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
1550 %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
1551 %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
1553 %mul = mul <8 x i32> %cvec1, %cvec2
1554 %mul0 = extractelement <8 x i32> %mul, i64 0
1555 %mul1 = extractelement <8 x i32> %mul, i64 1
1556 %mul2 = extractelement <8 x i32> %mul, i64 2
1557 %mul3 = extractelement <8 x i32> %mul, i64 3
1558 %mul4 = extractelement <8 x i32> %mul, i64 4
1559 %mul5 = extractelement <8 x i32> %mul, i64 5
1560 %mul6 = extractelement <8 x i32> %mul, i64 6
1561 %mul7 = extractelement <8 x i32> %mul, i64 7
1563 %acc = load i32, i32 addrspace(1)* %dst, align 4
1564 %add1 = add i32 %mul0, %acc
1565 %add2 = add i32 %add1, %mul1
1566 %add3 = add i32 %add2, %mul2
1567 %add4 = add i32 %add3, %mul3
1568 %add5 = add i32 %add4, %mul4
1569 %add6 = add i32 %add5, %mul5
1570 %add7 = add i32 %add6, %mul6
1571 %add8 = add i32 %add7, %mul7
1573 store i32 %add8, i32 addrspace(1)* %dst, align 4
1577 ; TODO: Support this pattern.
; Kernel idot8_acc16_vecMul loads one <8 x i4> vector from %src1 and one from
; %src2, sign-extends every lane to i16, multiplies the two vectors lane-wise,
; and adds the eight products onto the i16 value loaded from %dst before
; storing the sum back to %dst.
; None of the autogenerated expectations below contains a dot-product
; instruction; every run configuration still checks for a multiply / add
; expansion. NOTE(review): the TODO above presumably asks for this
; vector-multiply form to be selected as a dot instruction -- confirm against
; the backend before relying on that reading.
1578 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
1579 ; GFX7-LABEL: idot8_acc16_vecMul:
1580 ; GFX7: ; %bb.0: ; %entry
1581 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1582 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1583 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1584 ; GFX7-NEXT: s_mov_b32 s6, -1
1585 ; GFX7-NEXT: s_mov_b32 s2, 0xffff
1586 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1587 ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
1588 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
1589 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0
1590 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1591 ; GFX7-NEXT: s_ashr_i32 s8, s0, 28
1592 ; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40018
1593 ; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40018
1594 ; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40014
1595 ; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40010
1596 ; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40000
1597 ; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40004
1598 ; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40008
1599 ; GFX7-NEXT: s_ashr_i32 s15, s1, 28
1600 ; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c
1601 ; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40014
1602 ; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40010
1603 ; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40000
1604 ; GFX7-NEXT: v_mov_b32_e32 v4, s19
1605 ; GFX7-NEXT: s_bfe_i32 s13, s0, 0x40004
1606 ; GFX7-NEXT: v_mov_b32_e32 v3, s20
1607 ; GFX7-NEXT: s_bfe_i32 s14, s0, 0x40008
1608 ; GFX7-NEXT: v_mov_b32_e32 v2, s21
1609 ; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c
1610 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1611 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s0, v1
1612 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2
1613 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3
1614 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4
1615 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1616 ; GFX7-NEXT: v_and_b32_e32 v2, s2, v2
1617 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1618 ; GFX7-NEXT: v_and_b32_e32 v4, s2, v4
1619 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
1620 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
1621 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
1622 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1623 ; GFX7-NEXT: v_mov_b32_e32 v5, s18
1624 ; GFX7-NEXT: v_mov_b32_e32 v6, s17
1625 ; GFX7-NEXT: v_mov_b32_e32 v7, s16
1626 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1627 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1628 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
1629 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1630 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
1631 ; GFX7-NEXT: v_mad_i32_i24 v0, s11, v5, v0
1632 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v6, v0
1633 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v7, v0
1634 ; GFX7-NEXT: v_mov_b32_e32 v1, s15
1635 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
1636 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
1637 ; GFX7-NEXT: s_endpgm
1639 ; GFX8-LABEL: idot8_acc16_vecMul:
1640 ; GFX8: ; %bb.0: ; %entry
1641 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1642 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1643 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1644 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1645 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1646 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1647 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1648 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
1649 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1650 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2
1651 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4
1652 ; GFX8-NEXT: s_lshr_b32 s0, s2, 4
1653 ; GFX8-NEXT: s_lshr_b32 s1, s2, 8
1654 ; GFX8-NEXT: s_lshr_b32 s5, s4, 4
1655 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8
1656 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1
1657 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0
1658 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6
1659 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
1660 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1661 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
1662 ; GFX8-NEXT: s_lshr_b32 s0, s2, 12
1663 ; GFX8-NEXT: s_lshr_b32 s1, s4, 12
1664 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1665 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1666 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1667 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1668 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0
1669 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1
1670 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16
1671 ; GFX8-NEXT: s_lshr_b32 s6, s4, 16
1672 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7
1673 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5
1674 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6
1675 ; GFX8-NEXT: s_lshr_b32 s0, s2, 20
1676 ; GFX8-NEXT: s_lshr_b32 s1, s4, 20
1677 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1678 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1679 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0
1680 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1
1681 ; GFX8-NEXT: s_lshr_b32 s5, s2, 24
1682 ; GFX8-NEXT: s_lshr_b32 s6, s4, 24
1683 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1684 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1685 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5
1686 ; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6
1687 ; GFX8-NEXT: s_lshr_b32 s0, s2, 28
1688 ; GFX8-NEXT: s_lshr_b32 s1, s4, 28
1689 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1690 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1691 ; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0
1692 ; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1
1693 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
1694 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
1695 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
1696 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
1697 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1698 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
1699 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2
1700 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1701 ; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2
1702 ; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2
1703 ; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2
1704 ; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2
1705 ; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2
1706 ; GFX8-NEXT: flat_store_short v[0:1], v2
1707 ; GFX8-NEXT: s_endpgm
1709 ; GFX9-LABEL: idot8_acc16_vecMul:
1710 ; GFX9: ; %bb.0: ; %entry
1711 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1712 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1713 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1714 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
1715 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1716 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1717 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1718 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
1719 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1720 ; GFX9-NEXT: s_and_b32 s0, s2, 15
1721 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
1722 ; GFX9-NEXT: s_and_b32 s5, s4, 15
1723 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004
1724 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1725 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6
1726 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
1727 ; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
1728 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1729 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6
1730 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010
1731 ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014
1732 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
1733 ; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018
1734 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10
1735 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28
1736 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
1737 ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1738 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2
1739 ; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008
1740 ; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c
1741 ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1742 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8
1743 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1744 ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1745 ; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
1746 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7
1747 ; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010
1748 ; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014
1749 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12
1750 ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1751 ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1752 ; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
1753 ; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018
1754 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28
1755 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
1756 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4
1757 ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1758 ; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
1759 ; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
1760 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9
1761 ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1762 ; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
1763 ; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10
1764 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1765 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1766 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1767 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1768 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1769 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
1770 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1771 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
1772 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1773 ; GFX9-NEXT: global_store_short v[0:1], v2, off
1774 ; GFX9-NEXT: s_endpgm
1776 ; GFX9-DL-LABEL: idot8_acc16_vecMul:
1777 ; GFX9-DL: ; %bb.0: ; %entry
1778 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1779 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1780 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1781 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1782 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1783 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1784 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1785 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
1786 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1787 ; GFX9-DL-NEXT: s_and_b32 s0, s2, 15
1788 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
1789 ; GFX9-DL-NEXT: s_and_b32 s5, s4, 15
1790 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1791 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1792 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6
1793 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
1794 ; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
1795 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1796 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6
1797 ; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010
1798 ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014
1799 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
1800 ; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018
1801 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10
1802 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
1803 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
1804 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1805 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2
1806 ; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008
1807 ; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c
1808 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1809 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8
1810 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1811 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
1812 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
1813 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7
1814 ; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010
1815 ; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014
1816 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12
1817 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1818 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
1819 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
1820 ; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018
1821 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
1822 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
1823 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4
1824 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
1825 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
1826 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
1827 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9
1828 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
1829 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
1830 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10
1831 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1832 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1833 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1834 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1835 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1836 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
1837 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1838 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6
1839 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1840 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
1841 ; GFX9-DL-NEXT: s_endpgm
1843 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
1844 ; GFX10-DL: ; %bb.0: ; %entry
1845 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1846 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1847 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1848 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1849 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1850 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1851 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1852 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1853 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
1854 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1855 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
1856 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
1857 ; GFX10-DL-NEXT: s_and_b32 s5, s4, 15
1858 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1859 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
1860 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
1861 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1862 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008
1863 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
1864 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c
1865 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8
1866 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1867 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010
1868 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1]
1869 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6
1870 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
1871 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1872 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010
1873 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1874 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
1875 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
1876 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1]
1877 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5
1878 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
1879 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8
1880 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
1881 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
1882 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
1883 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018
1884 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
1885 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1886 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
1887 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1888 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
1889 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2
1890 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
1891 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4
1892 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
1893 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
1894 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1]
1895 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
1896 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
1897 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1]
1898 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7
1899 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1900 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1901 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1902 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1903 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1904 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
1905 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1906 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7
1907 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1908 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
1909 ; GFX10-DL-NEXT: s_endpgm
1910 <8 x i4> addrspace(1)* %src2,
1911 i16 addrspace(1)* nocapture %dst) {
; Load both <8 x i4> operands.
1913 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
1914 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
; Sign-extend every 4-bit lane to 16 bits before multiplying.
1916 %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
1917 %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
; Multiply as a whole vector first, then extract each of the eight products.
1919 %mul = mul <8 x i16> %cvec1, %cvec2
1920 %mul0 = extractelement <8 x i16> %mul, i64 0
1921 %mul1 = extractelement <8 x i16> %mul, i64 1
1922 %mul2 = extractelement <8 x i16> %mul, i64 2
1923 %mul3 = extractelement <8 x i16> %mul, i64 3
1924 %mul4 = extractelement <8 x i16> %mul, i64 4
1925 %mul5 = extractelement <8 x i16> %mul, i64 5
1926 %mul6 = extractelement <8 x i16> %mul, i64 6
1927 %mul7 = extractelement <8 x i16> %mul, i64 7
; Accumulate: start from the i16 currently stored at %dst and add the products
; one at a time in lane order 0..7.
; NOTE(review): align 4 on the i16 load and store is stricter than the 2-byte
; size of the type -- confirm this is intentional.
1929 %acc = load i16, i16 addrspace(1)* %dst, align 4
1930 %add1 = add i16 %mul0, %acc
1931 %add2 = add i16 %add1, %mul1
1932 %add3 = add i16 %add2, %mul2
1933 %add4 = add i16 %add3, %mul3
1934 %add5 = add i16 %add4, %mul4
1935 %add6 = add i16 %add5, %mul5
1936 %add7 = add i16 %add6, %mul6
1937 %add8 = add i16 %add7, %mul7
; Write the 16-bit sum back to %dst.
1939 store i16 %add8, i16 addrspace(1)* %dst, align 4
1943 ; TODO: Support this pattern.
1944 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
1945 ; GFX7-LABEL: idot8_acc8_vecMul:
1946 ; GFX7: ; %bb.0: ; %entry
1947 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1948 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1949 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1950 ; GFX7-NEXT: s_mov_b32 s6, -1
1951 ; GFX7-NEXT: s_movk_i32 s0, 0xff
1952 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1953 ; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
1954 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
1955 ; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0
1956 ; GFX7-NEXT: s_mov_b32 s1, 0xffff
1957 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000
1959 ; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004
1960 ; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000
1961 ; GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004
1962 ; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008
1963 ; GFX7-NEXT: s_bfe_i32 s19, s8, 0x4000c
1964 ; GFX7-NEXT: s_bfe_i32 s20, s8, 0x40010
1965 ; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014
1966 ; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018
1967 ; GFX7-NEXT: s_ashr_i32 s8, s8, 28
1968 ; GFX7-NEXT: v_mov_b32_e32 v7, s17
1969 ; GFX7-NEXT: v_mov_b32_e32 v8, s16
1970 ; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008
1971 ; GFX7-NEXT: v_mov_b32_e32 v6, s18
1972 ; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c
1973 ; GFX7-NEXT: v_mov_b32_e32 v5, s19
1974 ; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40010
1975 ; GFX7-NEXT: v_mov_b32_e32 v4, s20
1976 ; GFX7-NEXT: s_bfe_i32 s14, s2, 0x40014
1977 ; GFX7-NEXT: v_mov_b32_e32 v3, s21
1978 ; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40018
1979 ; GFX7-NEXT: v_mov_b32_e32 v2, s22
1980 ; GFX7-NEXT: s_ashr_i32 s2, s2, 28
1981 ; GFX7-NEXT: v_mov_b32_e32 v1, s8
1982 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s2, v1
1983 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s15, v2
1984 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s14, v3
1985 ; GFX7-NEXT: v_mul_i32_i24_e32 v9, s13, v4
1986 ; GFX7-NEXT: v_mul_i32_i24_e32 v5, s12, v5
1987 ; GFX7-NEXT: v_mul_i32_i24_e32 v6, s11, v6
1988 ; GFX7-NEXT: v_mul_i32_i24_e32 v7, s10, v7
1989 ; GFX7-NEXT: v_mul_i32_i24_e32 v8, s9, v8
1990 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1991 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2
1992 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1993 ; GFX7-NEXT: v_and_b32_e32 v9, s0, v9
1994 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
1995 ; GFX7-NEXT: v_and_b32_e32 v6, s0, v6
1996 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7
1997 ; GFX7-NEXT: v_and_b32_e32 v8, s0, v8
1998 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
1999 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v3
2000 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5
2001 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v7
2002 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2003 ; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
2004 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2005 ; GFX7-NEXT: v_and_b32_e32 v5, s1, v5
2006 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
2007 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v3
2008 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8
2009 ; GFX7-NEXT: v_alignbit_b32 v5, v1, v2, 16
2010 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
2011 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1
2012 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1
2013 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
2014 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2015 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2016 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
2017 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0
2018 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0
2019 ; GFX7-NEXT: v_mad_i32_i24 v0, s13, v4, v0
2020 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7
2021 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8
2022 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2023 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0
2024 ; GFX7-NEXT: s_endpgm
2026 ; GFX8-LABEL: idot8_acc8_vecMul:
2027 ; GFX8: ; %bb.0: ; %entry
2028 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2029 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2030 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2031 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2032 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2033 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
2034 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2035 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2036 ; GFX8-NEXT: s_mov_b32 s0, 0xffff
2037 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2038 ; GFX8-NEXT: s_lshr_b32 s8, s1, 4
2039 ; GFX8-NEXT: s_lshr_b32 s9, s1, 12
2040 ; GFX8-NEXT: s_lshr_b32 s10, s1, 8
2041 ; GFX8-NEXT: s_lshr_b32 s15, s2, 4
2042 ; GFX8-NEXT: s_lshr_b32 s16, s2, 12
2043 ; GFX8-NEXT: s_lshr_b32 s17, s2, 8
2044 ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1
2045 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
2046 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10
2047 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9
2048 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8
2049 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17
2050 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16
2051 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15
2052 ; GFX8-NEXT: s_lshr_b32 s4, s1, 20
2053 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
2054 ; GFX8-NEXT: s_lshr_b32 s6, s1, 28
2055 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
2056 ; GFX8-NEXT: s_lshr_b32 s11, s2, 20
2057 ; GFX8-NEXT: s_lshr_b32 s12, s2, 16
2058 ; GFX8-NEXT: s_lshr_b32 s13, s2, 28
2059 ; GFX8-NEXT: s_lshr_b32 s14, s2, 24
2060 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7
2061 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6
2062 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5
2063 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4
2064 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14
2065 ; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13
2066 ; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12
2067 ; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11
2068 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2069 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2070 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2071 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2072 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2073 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2074 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2075 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2076 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2077 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2078 ; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2079 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2080 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2081 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2082 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2083 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2084 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2085 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2086 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2087 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
2088 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2089 ; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2090 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2091 ; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2092 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2093 ; GFX8-NEXT: v_and_b32_e32 v3, s0, v3
2094 ; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2095 ; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2096 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2097 ; GFX8-NEXT: v_and_b32_e32 v5, s0, v9
2098 ; GFX8-NEXT: v_or_b32_e32 v4, v3, v4
2099 ; GFX8-NEXT: v_or_b32_e32 v6, v5, v7
2100 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4
2101 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6
2102 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
2104 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
2105 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
2106 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2107 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
2108 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
2109 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2110 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2111 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2112 ; GFX8-NEXT: s_endpgm
2114 ; GFX9-LABEL: idot8_acc8_vecMul:
2115 ; GFX9: ; %bb.0: ; %entry
2116 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2117 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2118 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
2119 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2120 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2121 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2122 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
2123 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
2124 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
2125 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2126 ; GFX9-NEXT: s_lshr_b32 s8, s0, 4
2127 ; GFX9-NEXT: s_lshr_b32 s15, s1, 4
2128 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
2129 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
2130 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8
2131 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15
2132 ; GFX9-NEXT: s_lshr_b32 s9, s0, 12
2133 ; GFX9-NEXT: s_lshr_b32 s10, s0, 8
2134 ; GFX9-NEXT: s_lshr_b32 s16, s1, 12
2135 ; GFX9-NEXT: s_lshr_b32 s17, s1, 8
2136 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10
2137 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9
2138 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17
2139 ; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16
2140 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2141 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2142 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2143 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2144 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2145 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2146 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2147 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2148 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4
2149 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2150 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2151 ; GFX9-NEXT: s_lshr_b32 s4, s0, 20
2152 ; GFX9-NEXT: s_lshr_b32 s5, s0, 16
2153 ; GFX9-NEXT: s_lshr_b32 s11, s1, 20
2154 ; GFX9-NEXT: s_lshr_b32 s12, s1, 16
2155 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2156 ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
2157 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5
2158 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s4
2159 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12
2160 ; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s11
2161 ; GFX9-NEXT: s_lshr_b32 s6, s0, 28
2162 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24
2163 ; GFX9-NEXT: s_lshr_b32 s13, s1, 28
2164 ; GFX9-NEXT: s_lshr_b32 s14, s1, 24
2165 ; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
2166 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2167 ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7
2168 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6
2169 ; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14
2170 ; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13
2171 ; GFX9-NEXT: v_or_b32_e32 v5, v3, v5
2172 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2173 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2174 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2175 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v18
2176 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2177 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2178 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2179 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2180 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2181 ; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17
2182 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5
2183 ; GFX9-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2184 ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2185 ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15
2186 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2187 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
2188 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v8
2189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2190 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
2191 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7
2192 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2193 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2194 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
2195 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6
2196 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
2197 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2198 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2199 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
2200 ; GFX9-NEXT: s_endpgm
2202 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
2203 ; GFX9-DL: ; %bb.0: ; %entry
2204 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2205 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2206 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
2207 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2208 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2209 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2210 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
2211 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
2212 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
2213 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2214 ; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4
2215 ; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4
2216 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
2217 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
2218 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
2219 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
2220 ; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 12
2221 ; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 8
2222 ; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 12
2223 ; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 8
2224 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10
2225 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9
2226 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17
2227 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16
2228 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2229 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2230 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2231 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2232 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2233 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2234 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2235 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2236 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4
2237 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2238 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2239 ; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 20
2240 ; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16
2241 ; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 20
2242 ; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 16
2243 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2244 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
2245 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5
2246 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4
2247 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12
2248 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s11
2249 ; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28
2250 ; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24
2251 ; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28
2252 ; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24
2253 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
2254 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2255 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
2256 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
2257 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14
2258 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13
2259 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5
2260 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2261 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2262 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2263 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v18
2264 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2265 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2266 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2267 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2268 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2269 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17
2270 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5
2271 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2272 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2273 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15
2274 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2275 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
2276 ; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8
2277 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2278 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
2279 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7
2280 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2281 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2282 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
2283 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6
2284 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
2285 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2286 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2287 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
2288 ; GFX9-DL-NEXT: s_endpgm
2290 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
2291 ; GFX10-DL: ; %bb.0: ; %entry
2292 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
2293 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2294 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
2295 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2296 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2297 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
2298 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
2299 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2300 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
2301 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2302 ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
2303 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
2304 ; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
2305 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4
2307 ; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4
2308 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
2309 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
2310 ; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8
2311 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9
2312 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s16
2313 ; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 12
2314 ; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8
2315 ; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 12
2316 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10
2317 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
2318 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
2319 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
2320 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s18
2321 ; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2
2322 ; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24
2323 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s11
2324 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s17
2325 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2
2326 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
2327 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2
2328 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2
2329 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
2330 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8
2331 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15
2332 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s7
2333 ; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2
2334 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v6
2335 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
2336 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
2337 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2
2338 ; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16
2339 ; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20
2340 ; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16
2341 ; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 20
2342 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
2343 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
2344 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v14
2345 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
2346 ; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2
2347 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s6
2348 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
2349 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s5
2350 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s12
2351 ; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28
2352 ; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24
2353 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 28
2354 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v35, 12, s13
2355 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2
2356 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2
2357 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v27, v2
2358 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2
2359 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v15
2360 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8
2361 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s15
2362 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5
2363 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s14
2364 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2
2365 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2
2366 ; GFX10-DL-NEXT: v_and_b32_e32 v18, v35, v2
2367 ; GFX10-DL-NEXT: v_and_b32_e32 v19, v19, v2
2368 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v6, v13
2369 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2370 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2371 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2
2372 ; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v2
2373 ; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v2
2374 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
2375 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2376 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
2377 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v35, 12, v18
2378 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v19
2379 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2380 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v15, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2381 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9
2382 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v31, 12, v10
2383 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v16, 12, v16
2384 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v11, v2
2385 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2386 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
2387 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17
2388 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v2
2389 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v2
2390 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v35, v2
2391 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v2
2392 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v2
2393 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v31, v2
2394 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v10, v11
2395 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v2
2396 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v7, v6
2397 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5
2398 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13
2399 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v9, v12
2400 ; GFX10-DL-NEXT: v_and_b32_sdwa v9, v10, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2401 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5
2402 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2403 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2404 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
2405 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2406 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2407 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10
2408 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2409 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2410 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2411 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
2412 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2413 ; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2
2414 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
2415 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
2416 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
2417 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2418 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2419 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
2420 ; GFX10-DL-NEXT: s_endpgm
2421 <8 x i4> addrspace(1)* %src2,
2422 i8 addrspace(1)* nocapture %dst) {
2424 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
2425 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
2427 %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
2428 %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
2430 %mul = mul <8 x i8> %cvec1, %cvec2
2431 %mul0 = extractelement <8 x i8> %mul, i64 0
2432 %mul1 = extractelement <8 x i8> %mul, i64 1
2433 %mul2 = extractelement <8 x i8> %mul, i64 2
2434 %mul3 = extractelement <8 x i8> %mul, i64 3
2435 %mul4 = extractelement <8 x i8> %mul, i64 4
2436 %mul5 = extractelement <8 x i8> %mul, i64 5
2437 %mul6 = extractelement <8 x i8> %mul, i64 6
2438 %mul7 = extractelement <8 x i8> %mul, i64 7
2440 %acc = load i8, i8 addrspace(1)* %dst, align 4
2441 %add1 = add i8 %mul0, %acc
2442 %add2 = add i8 %add1, %mul1
2443 %add3 = add i8 %add2, %mul2
2444 %add4 = add i8 %add3, %mul3
2445 %add5 = add i8 %add4, %mul4
2446 %add6 = add i8 %add5, %mul5
2447 %add7 = add i8 %add6, %mul6
2448 %add8 = add i8 %add7, %mul7
2450 store i8 %add8, i8 addrspace(1)* %dst, align 4