1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; idot4_acc32: signed 4 x i8 dot product accumulated into an i32.
; Each lane of the two <4 x i8> inputs is sign-extended to i32 and multiplied;
; the four products are added to the i32 loaded from %dst and the sum is stored
; back to %dst. The GFX9-DL checks expect a single v_dot4_i32_i8, while the
; GFX7, GFX8 and GFX9-NODL checks expect a chain of four v_mad_i32_i24.
7 define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
8 ; GFX7-LABEL: idot4_acc32:
9 ; GFX7: ; %bb.0: ; %entry
10 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
11 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
12 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
13 ; GFX7-NEXT: s_mov_b32 s2, -1
14 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
16 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
17 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
18 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX7-NEXT: s_sext_i32_i8 s7, s4
20 ; GFX7-NEXT: s_sext_i32_i8 s8, s5
21 ; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
22 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
23 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
24 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
25 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1
26 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
27 ; GFX7-NEXT: v_mov_b32_e32 v1, s10
28 ; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
29 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0
30 ; GFX7-NEXT: v_mov_b32_e32 v1, s12
31 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24
32 ; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0
33 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24
34 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
35 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
36 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
39 ; GFX8-LABEL: idot4_acc32:
40 ; GFX8: ; %bb.0: ; %entry
41 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
42 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
43 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
44 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
45 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
46 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
47 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
48 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
49 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX8-NEXT: s_sext_i32_i8 s0, s2
51 ; GFX8-NEXT: s_sext_i32_i8 s1, s3
52 ; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008
53 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
54 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
55 ; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010
56 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
57 ; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
58 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
59 ; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
60 ; GFX8-NEXT: v_mad_i32_i24 v2, s5, v3, v2
61 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
62 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24
63 ; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2
64 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24
65 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
66 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
67 ; GFX8-NEXT: flat_store_dword v[0:1], v2
70 ; GFX9-NODL-LABEL: idot4_acc32:
71 ; GFX9-NODL: ; %bb.0: ; %entry
72 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
73 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
74 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
76 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
77 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
78 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
79 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
80 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
82 ; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
83 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008
84 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1
85 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
86 ; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010
87 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
88 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
89 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
90 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
91 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v3, v2
92 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8
93 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
94 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
95 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
96 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
97 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
98 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
99 ; GFX9-NODL-NEXT: s_endpgm
101 ; GFX9-DL-LABEL: idot4_acc32:
102 ; GFX9-DL: ; %bb.0: ; %entry
103 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
104 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
105 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
107 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
108 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
109 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
110 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
111 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
112 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
113 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
114 ; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3
115 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
116 ; GFX9-DL-NEXT: s_endpgm
117 <4 x i8> addrspace(1)* %src2,
118 i32 addrspace(1)* nocapture %dst) {
; Load both packed <4 x i8> operands.
120 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
121 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
; Lane 0: sign-extend each element to i32 and multiply.
123 %v1e0 = extractelement <4 x i8> %vec1, i64 0
124 %cv1e0 = sext i8 %v1e0 to i32
125 %v2e0 = extractelement <4 x i8> %vec2, i64 0
126 %cv2e0 = sext i8 %v2e0 to i32
127 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
; Lane 1.
129 %v1e1 = extractelement <4 x i8> %vec1, i64 1
130 %cv1e1 = sext i8 %v1e1 to i32
131 %v2e1 = extractelement <4 x i8> %vec2, i64 1
132 %cv2e1 = sext i8 %v2e1 to i32
133 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
; Lane 2.
135 %v1e2 = extractelement <4 x i8> %vec1, i64 2
136 %cv1e2 = sext i8 %v1e2 to i32
137 %v2e2 = extractelement <4 x i8> %vec2, i64 2
138 %cv2e2 = sext i8 %v2e2 to i32
139 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
; Lane 3.
141 %v1e3 = extractelement <4 x i8> %vec1, i64 3
142 %cv1e3 = sext i8 %v1e3 to i32
143 %v2e3 = extractelement <4 x i8> %vec2, i64 3
144 %cv2e3 = sext i8 %v2e3 to i32
145 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; Add the four products to the accumulator read from %dst and store the sum back.
147 %acc = load i32, i32 addrspace(1)* %dst, align 4
148 %add1 = add i32 %mul1, %acc
149 %add2 = add i32 %add1, %mul2
150 %add3 = add i32 %add2, %mul3
151 %add4 = add i32 %add3, %mul4
152 store i32 %add4, i32 addrspace(1)* %dst, align 4
156 ; TODO: Currently, vector elements {1 and 3} get zero-extended from i16 to i32 when they should
157 ; be sign-extended directly to i32; this prevents the pattern recognizer from recognizing this pattern.
; idot4_acc16: signed 4 x i8 dot product accumulated into an i16.
; Lanes are sign-extended to i16 (not i32), multiplied, and added to the i16
; loaded from %dst; the sum is stored back to %dst. None of the four check
; blocks below contains a v_dot4 instruction: GFX7 expects four v_mad_u32_u24,
; and GFX8/GFX9 expect alternating v_mad_i32_i24 / v_mad_u32_u24.
158 define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
159 ; GFX7-LABEL: idot4_acc16:
160 ; GFX7: ; %bb.0: ; %entry
161 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
162 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
163 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
164 ; GFX7-NEXT: s_mov_b32 s2, -1
165 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
166 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
168 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
169 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
170 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX7-NEXT: s_sext_i32_i8 s6, s4
172 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
173 ; GFX7-NEXT: s_sext_i32_i8 s7, s5
174 ; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
175 ; GFX7-NEXT: s_and_b32 s7, s7, s8
176 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
177 ; GFX7-NEXT: s_and_b32 s10, s10, s8
178 ; GFX7-NEXT: s_and_b32 s6, s6, s8
179 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
180 ; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
181 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24
182 ; GFX7-NEXT: s_and_b32 s12, s12, s8
183 ; GFX7-NEXT: s_and_b32 s9, s9, s8
184 ; GFX7-NEXT: v_mov_b32_e32 v2, s10
185 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24
186 ; GFX7-NEXT: s_and_b32 s11, s11, s8
187 ; GFX7-NEXT: s_and_b32 s5, s5, s8
188 ; GFX7-NEXT: v_mov_b32_e32 v3, s12
189 ; GFX7-NEXT: s_and_b32 s4, s4, s8
190 ; GFX7-NEXT: s_waitcnt vmcnt(0)
191 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
192 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
193 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
194 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
195 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
196 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
197 ; GFX7-NEXT: s_endpgm
199 ; GFX8-LABEL: idot4_acc16:
200 ; GFX8: ; %bb.0: ; %entry
201 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
202 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
203 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
204 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
205 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
206 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
207 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
208 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
209 ; GFX8-NEXT: s_mov_b32 s0, 0xffff
210 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
212 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8
213 ; GFX8-NEXT: s_sext_i32_i8 s4, s2
214 ; GFX8-NEXT: s_bfe_i32 s5, s5, 0x80000
215 ; GFX8-NEXT: s_bfe_i32 s6, s6, 0x80000
216 ; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010
217 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24
218 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
219 ; GFX8-NEXT: s_sext_i32_i8 s3, s1
220 ; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
221 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24
222 ; GFX8-NEXT: s_and_b32 s4, s0, s5
223 ; GFX8-NEXT: s_and_b32 s5, s0, s6
224 ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x80000
225 ; GFX8-NEXT: s_bfe_i32 s2, s2, 0x80000
226 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
227 ; GFX8-NEXT: s_and_b32 s1, s0, s1
228 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
229 ; GFX8-NEXT: s_and_b32 s0, s0, s2
230 ; GFX8-NEXT: s_waitcnt vmcnt(0)
231 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
232 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
233 ; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2
234 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
235 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
236 ; GFX8-NEXT: flat_store_short v[0:1], v2
237 ; GFX8-NEXT: s_endpgm
239 ; GFX9-NODL-LABEL: idot4_acc16:
240 ; GFX9-NODL: ; %bb.0: ; %entry
241 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
242 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
243 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
245 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
246 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
247 ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
248 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
249 ; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
250 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 8
252 ; GFX9-NODL-NEXT: s_lshr_b32 s6, s2, 8
253 ; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
254 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
255 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s6, 0x80000
256 ; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
257 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
258 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
259 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
260 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
261 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
262 ; GFX9-NODL-NEXT: s_and_b32 s4, s0, s5
263 ; GFX9-NODL-NEXT: s_and_b32 s5, s0, s6
264 ; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000
265 ; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
266 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
267 ; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1
268 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
269 ; GFX9-NODL-NEXT: s_and_b32 s0, s0, s2
270 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
271 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
272 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
273 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
274 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
275 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
276 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
277 ; GFX9-NODL-NEXT: s_endpgm
279 ; GFX9-DL-LABEL: idot4_acc16:
280 ; GFX9-DL: ; %bb.0: ; %entry
281 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
282 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
283 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
285 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
286 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
287 ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
288 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
289 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
290 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 8
292 ; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 8
293 ; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
294 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
295 ; GFX9-DL-NEXT: s_bfe_i32 s6, s6, 0x80000
296 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010
297 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
298 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
299 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
300 ; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010
301 ; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
302 ; GFX9-DL-NEXT: s_and_b32 s4, s0, s5
303 ; GFX9-DL-NEXT: s_and_b32 s5, s0, s6
304 ; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000
305 ; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
306 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
307 ; GFX9-DL-NEXT: s_and_b32 s1, s0, s1
308 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
309 ; GFX9-DL-NEXT: s_and_b32 s0, s0, s2
310 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
311 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
312 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
313 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
314 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
315 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
316 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
317 ; GFX9-DL-NEXT: s_endpgm
318 <4 x i8> addrspace(1)* %src2,
319 i16 addrspace(1)* nocapture %dst) {
; Load both packed <4 x i8> operands.
321 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
322 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
; Lane 0: sign-extend each element to i16 and multiply.
324 %v1e0 = extractelement <4 x i8> %vec1, i64 0
325 %cv1e0 = sext i8 %v1e0 to i16
326 %v2e0 = extractelement <4 x i8> %vec2, i64 0
327 %cv2e0 = sext i8 %v2e0 to i16
328 %mul1 = mul nsw i16 %cv1e0, %cv2e0
; Lane 1.
330 %v1e1 = extractelement <4 x i8> %vec1, i64 1
331 %cv1e1 = sext i8 %v1e1 to i16
332 %v2e1 = extractelement <4 x i8> %vec2, i64 1
333 %cv2e1 = sext i8 %v2e1 to i16
334 %mul2 = mul nsw i16 %cv1e1, %cv2e1
; Lane 2.
336 %v1e2 = extractelement <4 x i8> %vec1, i64 2
337 %cv1e2 = sext i8 %v1e2 to i16
338 %v2e2 = extractelement <4 x i8> %vec2, i64 2
339 %cv2e2 = sext i8 %v2e2 to i16
340 %mul3 = mul nsw i16 %cv1e2, %cv2e2
; Lane 3.
342 %v1e3 = extractelement <4 x i8> %vec1, i64 3
343 %cv1e3 = sext i8 %v1e3 to i16
344 %v2e3 = extractelement <4 x i8> %vec2, i64 3
345 %cv2e3 = sext i8 %v2e3 to i16
346 %mul4 = mul nsw i16 %cv1e3, %cv2e3
; Add the four products to the i16 accumulator read from %dst and store the sum back.
348 %acc = load i16, i16 addrspace(1)* %dst, align 2
349 %add1 = add i16 %mul1, %acc
350 %add2 = add i16 %add1, %mul2
351 %add3 = add i16 %add2, %mul3
352 %add4 = add i16 %add3, %mul4
353 store i16 %add4, i16 addrspace(1)* %dst, align 2
; idot4_acc8: 4 x i8 dot product accumulated into an i8.
; The i8 lanes are multiplied directly (no extension) and the four products
; are added to the i8 loaded from %dst; the sum is stored back to %dst.
; The GFX9-DL checks expect a single v_dot4_u32_u8, while the GFX7, GFX8 and
; GFX9-NODL checks expect a chain of four v_mad_u32_u24.
357 define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
358 ; GFX7-LABEL: idot4_acc8:
359 ; GFX7: ; %bb.0: ; %entry
360 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
361 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
362 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
363 ; GFX7-NEXT: s_mov_b32 s2, -1
364 ; GFX7-NEXT: s_movk_i32 s8, 0xff
365 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
367 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
368 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
369 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX7-NEXT: s_and_b32 s7, s4, s8
371 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
372 ; GFX7-NEXT: s_and_b32 s6, s5, s8
373 ; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
374 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
375 ; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
376 ; GFX7-NEXT: v_mov_b32_e32 v2, s8
377 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
378 ; GFX7-NEXT: s_lshr_b32 s5, s5, 24
379 ; GFX7-NEXT: v_mov_b32_e32 v3, s10
380 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24
381 ; GFX7-NEXT: s_waitcnt vmcnt(0)
382 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
383 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
384 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
385 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
386 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
387 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
388 ; GFX7-NEXT: s_endpgm
390 ; GFX8-LABEL: idot4_acc8:
391 ; GFX8: ; %bb.0: ; %entry
392 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
393 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
394 ; GFX8-NEXT: s_movk_i32 s2, 0xff
395 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
397 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
398 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
399 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
400 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
401 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
403 ; GFX8-NEXT: s_and_b32 s3, s1, s2
404 ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
405 ; GFX8-NEXT: s_and_b32 s2, s0, s2
406 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
407 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
408 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
409 ; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
410 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24
411 ; GFX8-NEXT: v_mov_b32_e32 v5, s6
412 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24
413 ; GFX8-NEXT: s_waitcnt vmcnt(0)
414 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
415 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
416 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
417 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
418 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
419 ; GFX8-NEXT: flat_store_byte v[0:1], v2
420 ; GFX8-NEXT: s_endpgm
422 ; GFX9-NODL-LABEL: idot4_acc8:
423 ; GFX9-NODL: ; %bb.0: ; %entry
424 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
425 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
426 ; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
427 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
429 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
430 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
431 ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
432 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
433 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
435 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
436 ; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
437 ; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
438 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
439 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
440 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
441 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
442 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
443 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
444 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
445 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
446 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
447 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
448 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
449 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
450 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
451 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
452 ; GFX9-NODL-NEXT: s_endpgm
454 ; GFX9-DL-LABEL: idot4_acc8:
455 ; GFX9-DL: ; %bb.0: ; %entry
456 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
457 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
458 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
460 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
461 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
462 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
463 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
464 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
465 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
466 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
467 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
468 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
469 ; GFX9-DL-NEXT: s_endpgm
470 <4 x i8> addrspace(1)* %src2,
471 i8 addrspace(1)* nocapture %dst) {
; Load both packed <4 x i8> operands.
473 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
474 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
; Lanes 0-3: multiply the raw i8 elements; the products stay in i8.
476 %v1e0 = extractelement <4 x i8> %vec1, i64 0
477 %v2e0 = extractelement <4 x i8> %vec2, i64 0
478 %mul1 = mul i8 %v1e0, %v2e0
480 %v1e1 = extractelement <4 x i8> %vec1, i64 1
481 %v2e1 = extractelement <4 x i8> %vec2, i64 1
482 %mul2 = mul i8 %v1e1, %v2e1
484 %v1e2 = extractelement <4 x i8> %vec1, i64 2
485 %v2e2 = extractelement <4 x i8> %vec2, i64 2
486 %mul3 = mul i8 %v1e2, %v2e2
488 %v1e3 = extractelement <4 x i8> %vec1, i64 3
489 %v2e3 = extractelement <4 x i8> %vec2, i64 3
490 %mul4 = mul i8 %v1e3, %v2e3
; Add the four products to the i8 accumulator read from %dst and store the sum back.
; NOTE(review): the i8 load and store of %dst are marked align 2, and only the
; final add carries nsw - confirm both are intentional.
492 %acc = load i8, i8 addrspace(1)* %dst, align 2
493 %add1 = add i8 %mul1, %acc
494 %add2 = add i8 %add1, %mul2
495 %add3 = add i8 %add2, %mul3
496 %add4 = add nsw i8 %add3, %mul4
497 store i8 %add4, i8 addrspace(1)* %dst, align 2
; idot4_multiuse_mul1: signed 4 x i8 dot product with an i32 accumulator in
; which the lane-0 product %mul1 is added into the sum twice (in %add and again
; in %add2), so five terms are accumulated. All four check blocks, including
; GFX9-DL, expect five v_mad_i32_i24 and no v_dot4 instruction.
501 define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
502 ; GFX7-LABEL: idot4_multiuse_mul1:
503 ; GFX7: ; %bb.0: ; %entry
504 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
505 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
506 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
507 ; GFX7-NEXT: s_mov_b32 s2, -1
508 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
509 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
510 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
511 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
512 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
513 ; GFX7-NEXT: s_sext_i32_i8 s7, s4
514 ; GFX7-NEXT: s_sext_i32_i8 s8, s5
515 ; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
516 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
517 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
518 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
519 ; GFX7-NEXT: v_mad_i32_i24 v1, s7, v0, v1
520 ; GFX7-NEXT: v_mov_b32_e32 v2, s10
521 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
522 ; GFX7-NEXT: v_mad_i32_i24 v1, s9, v2, v1
523 ; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
524 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1
525 ; GFX7-NEXT: v_mov_b32_e32 v1, s12
526 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24
527 ; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0
528 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24
529 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
530 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
531 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
532 ; GFX7-NEXT: s_endpgm
534 ; GFX8-LABEL: idot4_multiuse_mul1:
535 ; GFX8: ; %bb.0: ; %entry
536 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
537 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
538 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
539 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
540 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
541 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
542 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
543 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
544 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
545 ; GFX8-NEXT: s_sext_i32_i8 s0, s2
546 ; GFX8-NEXT: s_sext_i32_i8 s1, s3
547 ; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008
548 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
549 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
550 ; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
551 ; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3
552 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
553 ; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010
554 ; GFX8-NEXT: v_mad_i32_i24 v3, s5, v4, v3
555 ; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
556 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
557 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
558 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24
559 ; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2
560 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24
561 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
562 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
563 ; GFX8-NEXT: flat_store_dword v[0:1], v2
564 ; GFX8-NEXT: s_endpgm
566 ; GFX9-NODL-LABEL: idot4_multiuse_mul1:
567 ; GFX9-NODL: ; %bb.0: ; %entry
568 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
569 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
570 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
571 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
572 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
573 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
574 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
575 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
576 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
578 ; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
579 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008
580 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1
581 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
582 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
583 ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
584 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s6
585 ; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010
586 ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s5, v4, v3
587 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
588 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
589 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8
590 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
591 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
592 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
593 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
594 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
595 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
596 ; GFX9-NODL-NEXT: s_endpgm
598 ; GFX9-DL-LABEL: idot4_multiuse_mul1:
599 ; GFX9-DL: ; %bb.0: ; %entry
600 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
601 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
602 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
603 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
604 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
605 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
606 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
607 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
608 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
609 ; GFX9-DL-NEXT: s_sext_i32_i8 s0, s2
610 ; GFX9-DL-NEXT: s_sext_i32_i8 s1, s3
611 ; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80008
612 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1
613 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
614 ; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80008
615 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
616 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
617 ; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010
618 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, s5, v4, v3
619 ; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80010
620 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
621 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8
622 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24
623 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
624 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24
625 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
626 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
627 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
628 ; GFX9-DL-NEXT: s_endpgm
629 <4 x i8> addrspace(1)* %src2,
630 i32 addrspace(1)* nocapture %dst) {
; Load both packed <4 x i8> operands.
632 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
633 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
; Lane 0: sign-extend each element to i32 and multiply.
635 %v1e0 = extractelement <4 x i8> %vec1, i64 0
636 %cv1e0 = sext i8 %v1e0 to i32
637 %v2e0 = extractelement <4 x i8> %vec2, i64 0
638 %cv2e0 = sext i8 %v2e0 to i32
639 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
; Lane 1.
641 %v1e1 = extractelement <4 x i8> %vec1, i64 1
642 %cv1e1 = sext i8 %v1e1 to i32
643 %v2e1 = extractelement <4 x i8> %vec2, i64 1
644 %cv2e1 = sext i8 %v2e1 to i32
645 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
; Lane 2.
647 %v1e2 = extractelement <4 x i8> %vec1, i64 2
648 %cv1e2 = sext i8 %v1e2 to i32
649 %v2e2 = extractelement <4 x i8> %vec2, i64 2
650 %cv2e2 = sext i8 %v2e2 to i32
651 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
; Lane 3.
653 %v1e3 = extractelement <4 x i8> %vec1, i64 3
654 %cv1e3 = sext i8 %v1e3 to i32
655 %v2e3 = extractelement <4 x i8> %vec2, i64 3
656 %cv2e3 = sext i8 %v2e3 to i32
657 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
; Accumulate into the i32 read from %dst; %mul1 is used twice (%add and %add2).
659 %acc = load i32, i32 addrspace(1)* %dst, align 4
660 %add = add i32 %mul1, %acc
661 %add1 = add i32 %mul2, %add
662 %add2 = add i32 %add1, %mul1
663 %add3 = add i32 %add2, %mul3
664 %add4 = add i32 %add3, %mul4
666 store i32 %add4, i32 addrspace(1)* %dst, align 4
670 ; TODO: Support this pattern.
; idot4_acc32_vecMul: signed 4 x i8 dot product written with vector operations.
; Both <4 x i8> inputs are sign-extended to <4 x i32> as whole vectors and
; multiplied with a single vector mul; the four lanes of the product are then
; extracted and added to the i32 loaded from %dst, and the sum is stored back.
; All four check blocks, including GFX9-DL, expect four v_mad_i32_i24 rather
; than a v_dot4 instruction.
671 define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
672 ; GFX7-LABEL: idot4_acc32_vecMul:
673 ; GFX7: ; %bb.0: ; %entry
674 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
675 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
676 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
677 ; GFX7-NEXT: s_mov_b32 s2, -1
678 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
680 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
681 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
682 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
683 ; GFX7-NEXT: s_ashr_i32 s7, s4, 24
684 ; GFX7-NEXT: s_ashr_i32 s10, s5, 24
685 ; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80010
686 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80008
687 ; GFX7-NEXT: s_sext_i32_i8 s5, s5
688 ; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80010
689 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
690 ; GFX7-NEXT: s_sext_i32_i8 s4, s4
691 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
692 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
693 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1
694 ; GFX7-NEXT: v_mov_b32_e32 v1, s12
695 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0
696 ; GFX7-NEXT: v_mov_b32_e32 v1, s11
697 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
698 ; GFX7-NEXT: v_mov_b32_e32 v1, s10
699 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v1, v0
700 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
701 ; GFX7-NEXT: s_endpgm
703 ; GFX8-LABEL: idot4_acc32_vecMul:
704 ; GFX8: ; %bb.0: ; %entry
705 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
706 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
707 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
708 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
709 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
710 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
711 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
712 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
713 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
714 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 8, s2
715 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s3
716 ; GFX8-NEXT: s_ashr_i32 s5, s3, 24
717 ; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80010
718 ; GFX8-NEXT: s_sext_i32_i8 s3, s3
719 ; GFX8-NEXT: s_ashr_i32 s0, s2, 24
720 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80010
721 ; GFX8-NEXT: s_sext_i32_i8 s2, s2
722 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
723 ; GFX8-NEXT: v_mov_b32_e32 v5, s4
724 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
725 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
726 ; GFX8-NEXT: v_mad_i32_i24 v4, s2, v4, v5
727 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4
728 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
729 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
730 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
731 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
732 ; GFX8-NEXT: flat_store_dword v[0:1], v2
733 ; GFX8-NEXT: s_endpgm
735 ; GFX9-NODL-LABEL: idot4_acc32_vecMul:
736 ; GFX9-NODL: ; %bb.0: ; %entry
737 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
738 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
739 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
740 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
741 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
742 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
743 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
744 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
745 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s2
747 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3
748 ; GFX9-NODL-NEXT: s_ashr_i32 s5, s3, 24
749 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80010
750 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3
751 ; GFX9-NODL-NEXT: s_ashr_i32 s0, s2, 24
752 ; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80010
753 ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2
754 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
755 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s4
756 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
757 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
758 ; GFX9-NODL-NEXT: v_mad_i32_i24 v4, s2, v4, v5
759 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v2, v3, v4
760 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
761 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
762 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
763 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
764 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
765 ; GFX9-NODL-NEXT: s_endpgm
767 ; GFX9-DL-LABEL: idot4_acc32_vecMul:
768 ; GFX9-DL: ; %bb.0: ; %entry
769 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
770 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
771 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
772 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
773 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
774 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
775 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
776 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
777 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
778 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2
779 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3
780 ; GFX9-DL-NEXT: s_ashr_i32 s5, s3, 24
781 ; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80010
782 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3
783 ; GFX9-DL-NEXT: s_ashr_i32 s0, s2, 24
784 ; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x80010
785 ; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2
786 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
787 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
788 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
789 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
790 ; GFX9-DL-NEXT: v_mad_i32_i24 v4, s2, v4, v5
791 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4
792 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
793 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
794 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
795 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
796 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
797 ; GFX9-DL-NEXT: s_endpgm
798 <4 x i8> addrspace(1)* %src2,
799 i32 addrspace(1)* nocapture %dst) {
; Load both packed <4 x i8> operands.
801 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
802 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
; Sign-extend the whole vectors to <4 x i32>.
804 %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
805 %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
; One vector multiply, then pull out each lane's product.
807 %mul = mul <4 x i32> %cvec1, %cvec2
808 %mul0 = extractelement <4 x i32> %mul, i64 0
809 %mul1 = extractelement <4 x i32> %mul, i64 1
810 %mul2 = extractelement <4 x i32> %mul, i64 2
811 %mul3 = extractelement <4 x i32> %mul, i64 3
; Add the four products to the accumulator read from %dst and store the sum back.
813 %acc = load i32, i32 addrspace(1)* %dst, align 4
814 %add1 = add i32 %mul0, %acc
815 %add2 = add i32 %add1, %mul1
816 %add3 = add i32 %add2, %mul2
817 %add4 = add i32 %add3, %mul3
819 store i32 %add4, i32 addrspace(1)* %dst, align 4
823 define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
824 ; GFX7-LABEL: idot4_acc16_vecMul:
825 ; GFX7: ; %bb.0: ; %entry
826 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
827 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
828 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
829 ; GFX7-NEXT: s_mov_b32 s2, -1
830 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
831 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
832 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
833 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
834 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
835 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX7-NEXT: s_sext_i32_i8 s6, s4
837 ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80008
838 ; GFX7-NEXT: s_sext_i32_i8 s10, s5
839 ; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008
840 ; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
841 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24
842 ; GFX7-NEXT: v_mov_b32_e32 v3, s11
843 ; GFX7-NEXT: v_mov_b32_e32 v4, s10
844 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80010
845 ; GFX7-NEXT: v_mov_b32_e32 v2, s12
846 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24
847 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
848 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
849 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s9, v2
850 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s7, v3
851 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s6, v4
852 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
853 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
854 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
855 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
856 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
857 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
858 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
859 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
860 ; GFX7-NEXT: s_waitcnt vmcnt(0)
861 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
862 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
863 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
864 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
865 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
866 ; GFX7-NEXT: s_endpgm
868 ; GFX8-LABEL: idot4_acc16_vecMul:
869 ; GFX8: ; %bb.0: ; %entry
870 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
871 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
872 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
874 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
875 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
876 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
877 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
878 ; GFX8-NEXT: s_mov_b32 s0, 0xffff
879 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
880 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
881 ; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80000
882 ; GFX8-NEXT: s_lshr_b32 s4, s2, 16
883 ; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80000
884 ; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s1
885 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x80000
886 ; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s3
887 ; GFX8-NEXT: s_and_b32 s3, s0, s6
888 ; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2
889 ; GFX8-NEXT: s_bfe_i32 s2, s4, 0x80000
890 ; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4
891 ; GFX8-NEXT: s_and_b32 s4, s0, s5
892 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
893 ; GFX8-NEXT: s_and_b32 s2, s0, s2
894 ; GFX8-NEXT: s_and_b32 s0, s0, s1
895 ; GFX8-NEXT: s_waitcnt vmcnt(0)
896 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2
897 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2
898 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
899 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
900 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2
901 ; GFX8-NEXT: flat_store_short v[0:1], v2
902 ; GFX8-NEXT: s_endpgm
904 ; GFX9-NODL-LABEL: idot4_acc16_vecMul:
905 ; GFX9-NODL: ; %bb.0: ; %entry
906 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
907 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
908 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff
909 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
910 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
911 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
912 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
913 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
914 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16
915 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
916 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
917 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
918 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s5, v4
919 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s4, 0x80000
920 ; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
921 ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4
922 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
923 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
924 ; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000
925 ; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
926 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
927 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4
928 ; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
929 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
930 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4
931 ; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
932 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
933 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
934 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
935 ; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off
936 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
937 ; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4
938 ; GFX9-NODL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
939 ; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v2
940 ; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
941 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
942 ; GFX9-NODL-NEXT: s_endpgm
944 ; GFX9-DL-LABEL: idot4_acc16_vecMul:
945 ; GFX9-DL: ; %bb.0: ; %entry
946 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
947 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
948 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
949 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
951 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
952 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
953 ; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
954 ; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16
955 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
956 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
957 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
958 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s5, v4
959 ; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80000
960 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
961 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4
962 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
963 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
964 ; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000
965 ; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
966 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
967 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4
968 ; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
969 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
970 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4
971 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
972 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
973 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
974 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
975 ; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off
976 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
977 ; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4
978 ; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
979 ; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2
980 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
981 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
982 ; GFX9-DL-NEXT: s_endpgm
983 <4 x i8> addrspace(1)* %src2,
984 i16 addrspace(1)* nocapture %dst) {
986 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
987 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
989 %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
990 %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
992 %mul = mul <4 x i16> %cvec1, %cvec2
993 %mul0 = extractelement <4 x i16> %mul, i64 0
994 %mul1 = extractelement <4 x i16> %mul, i64 1
995 %mul2 = extractelement <4 x i16> %mul, i64 2
996 %mul3 = extractelement <4 x i16> %mul, i64 3
998 %acc = load i16, i16 addrspace(1)* %dst, align 4
999 %add1 = add i16 %mul0, %acc
1000 %add2 = add i16 %add1, %mul1
1001 %add3 = add i16 %add2, %mul2
1002 %add4 = add i16 %add3, %mul3
1004 store i16 %add4, i16 addrspace(1)* %dst, align 4