1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
10 ; Signed dot product of two <8 x i4> vectors accumulated onto the i32 loaded from %dst; the dot-instruction targets are expected to select a single v_dot8_i32_i4.
11 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
12 ; GFX7-LABEL: idot8_acc32:
13 ; GFX7: ; %bb.0: ; %entry
14 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
15 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
16 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
17 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
18 ; GFX7-NEXT: s_mov_b32 s14, -1
19 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
20 ; GFX7-NEXT: s_add_u32 s12, s12, s3
21 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
22 ; GFX7-NEXT: s_mov_b32 s10, 0
23 ; GFX7-NEXT: s_mov_b32 s11, s3
24 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
26 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
27 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
28 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
29 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
30 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
31 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
32 ; GFX7-NEXT: s_mov_b32 s2, -1
33 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
34 ; GFX7-NEXT: s_waitcnt vmcnt(1)
35 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
36 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
37 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38 ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
39 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
40 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
42 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
43 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
44 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
45 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
46 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
47 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
48 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
49 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
50 ; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
51 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
52 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
53 ; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
54 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
55 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
56 ; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
57 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
58 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
59 ; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
60 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
61 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
62 ; GFX7-NEXT: s_endpgm
64 ; GFX8-LABEL: idot8_acc32:
65 ; GFX8: ; %bb.0: ; %entry
66 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
67 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
68 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
69 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
70 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
71 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
72 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
73 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
74 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
75 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
76 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
77 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
78 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
79 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
80 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
81 ; GFX8-NEXT: s_mov_b32 s10, -1
82 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
83 ; GFX8-NEXT: s_add_u32 s8, s8, s3
84 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
85 ; GFX8-NEXT: s_waitcnt vmcnt(1)
86 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
87 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
88 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
89 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
90 ; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
91 ; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
92 ; GFX8-NEXT: s_waitcnt vmcnt(0)
93 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
94 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
95 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
97 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
98 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
99 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
100 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
101 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
102 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
103 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
104 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
105 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
106 ; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
107 ; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
108 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
109 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
110 ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
111 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
112 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
113 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
114 ; GFX8-NEXT: flat_store_dword v[0:1], v2
115 ; GFX8-NEXT: s_endpgm
117 ; GFX9-LABEL: idot8_acc32:
118 ; GFX9: ; %bb.0: ; %entry
119 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
120 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
121 ; GFX9-NEXT: s_mov_b32 s10, -1
122 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
123 ; GFX9-NEXT: s_add_u32 s8, s8, s3
124 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
126 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
127 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
130 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
131 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
132 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
133 ; GFX9-NEXT: s_waitcnt vmcnt(1)
134 ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
137 ; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
138 ; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
139 ; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
140 ; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
141 ; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
142 ; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
143 ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
144 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
145 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
146 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
147 ; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
148 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
149 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
150 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
151 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v4
152 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, v5, v6
153 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v7, v8
154 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10
155 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
156 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4
158 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12
159 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14
160 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6
161 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16
162 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8
163 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1
164 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
165 ; GFX9-NEXT: s_endpgm
167 ; GFX9-DL-LABEL: idot8_acc32:
168 ; GFX9-DL: ; %bb.0: ; %entry
169 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
170 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
171 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
172 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
173 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
174 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
175 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
176 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
177 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
178 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
180 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
181 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
182 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
183 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
184 ; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
185 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
186 ; GFX9-DL-NEXT: s_endpgm
188 ; GFX10-DL-XNACK-LABEL: idot8_acc32:
189 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
190 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
191 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
192 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
193 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
194 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
195 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
196 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
197 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
198 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
199 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
201 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
202 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
203 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
204 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
205 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
206 ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
207 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
208 ; GFX10-DL-XNACK-NEXT: s_endpgm
210 ; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
211 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
212 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
213 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
214 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
215 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
216 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
217 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
218 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
219 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
220 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
221 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
222 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
224 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
225 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
226 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
227 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
228 ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
229 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
230 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
252 <8 x i4> addrspace(1)* %src2,
253 i32 addrspace(1)* nocapture %dst) {
254 entry:
255 %idx = call i32 @llvm.amdgcn.workitem.id.x()
256 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
257 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
258 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
259 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
260 ; For each lane 0..7: sign-extend the i4 element of both vectors to i32 and multiply.
261 %v1e0 = extractelement <8 x i4> %vec1, i64 0
262 %cv1e0 = sext i4 %v1e0 to i32
263 %v2e0 = extractelement <8 x i4> %vec2, i64 0
264 %cv2e0 = sext i4 %v2e0 to i32
265 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
267 %v1e1 = extractelement <8 x i4> %vec1, i64 1
268 %cv1e1 = sext i4 %v1e1 to i32
269 %v2e1 = extractelement <8 x i4> %vec2, i64 1
270 %cv2e1 = sext i4 %v2e1 to i32
271 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
273 %v1e2 = extractelement <8 x i4> %vec1, i64 2
274 %cv1e2 = sext i4 %v1e2 to i32
275 %v2e2 = extractelement <8 x i4> %vec2, i64 2
276 %cv2e2 = sext i4 %v2e2 to i32
277 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
279 %v1e3 = extractelement <8 x i4> %vec1, i64 3
280 %cv1e3 = sext i4 %v1e3 to i32
281 %v2e3 = extractelement <8 x i4> %vec2, i64 3
282 %cv2e3 = sext i4 %v2e3 to i32
283 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
285 %v1e4 = extractelement <8 x i4> %vec1, i64 4
286 %cv1e4 = sext i4 %v1e4 to i32
287 %v2e4 = extractelement <8 x i4> %vec2, i64 4
288 %cv2e4 = sext i4 %v2e4 to i32
289 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
291 %v1e5 = extractelement <8 x i4> %vec1, i64 5
292 %cv1e5 = sext i4 %v1e5 to i32
293 %v2e5 = extractelement <8 x i4> %vec2, i64 5
294 %cv2e5 = sext i4 %v2e5 to i32
295 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
297 %v1e6 = extractelement <8 x i4> %vec1, i64 6
298 %cv1e6 = sext i4 %v1e6 to i32
299 %v2e6 = extractelement <8 x i4> %vec2, i64 6
300 %cv2e6 = sext i4 %v2e6 to i32
301 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
303 %v1e7 = extractelement <8 x i4> %vec1, i64 7
304 %cv1e7 = sext i4 %v1e7 to i32
305 %v2e7 = extractelement <8 x i4> %vec2, i64 7
306 %cv2e7 = sext i4 %v2e7 to i32
307 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
308 ; Chain the eight products onto the i32 accumulator read from %dst, then write the sum back to %dst.
309 %acc = load i32, i32 addrspace(1)* %dst, align 4
310 %add1 = add i32 %mul0, %acc
311 %add2 = add i32 %add1, %mul1
312 %add3 = add i32 %add2, %mul2
313 %add4 = add i32 %add3, %mul3
314 %add5 = add i32 %add4, %mul4
315 %add6 = add i32 %add5, %mul5
316 %add7 = add i32 %add6, %mul6
317 %add8 = add i32 %add7, %mul7
319 store i32 %add8, i32 addrspace(1)* %dst, align 4
320 ret void
321 }
323 ; TODO: Once the unnecessary zero extensions of the elements are removed,
324 ; the pattern recognizer will kick in.
325 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
326 ; GFX7-LABEL: idot8_acc16:
327 ; GFX7: ; %bb.0: ; %entry
328 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
329 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
330 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
331 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
332 ; GFX7-NEXT: s_mov_b32 s14, -1
333 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
334 ; GFX7-NEXT: s_add_u32 s12, s12, s3
335 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
336 ; GFX7-NEXT: s_mov_b32 s10, 0
337 ; GFX7-NEXT: s_mov_b32 s11, s3
338 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
340 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
341 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
342 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
343 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
344 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
345 ; GFX7-NEXT: s_mov_b32 s2, -1
346 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
347 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
348 ; GFX7-NEXT: s_waitcnt vmcnt(2)
349 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
350 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
351 ; GFX7-NEXT: s_waitcnt vmcnt(1)
352 ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
353 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
354 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
355 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
356 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
357 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
358 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
359 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
360 ; GFX7-NEXT: s_waitcnt vmcnt(0)
361 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
362 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
363 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
364 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
365 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
366 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
367 ; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
368 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
369 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
370 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
371 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
372 ; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
373 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
374 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
375 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
376 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
377 ; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
378 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
379 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
380 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
381 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
382 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
383 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
384 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
385 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
386 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
387 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
388 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
389 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
390 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
391 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
392 ; GFX7-NEXT: s_endpgm
394 ; GFX8-LABEL: idot8_acc16:
395 ; GFX8: ; %bb.0: ; %entry
396 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
397 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
398 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
399 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
400 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
401 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
403 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
404 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
405 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
406 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
407 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
408 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
409 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
410 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
411 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
412 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
413 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
414 ; GFX8-NEXT: s_mov_b32 s10, -1
415 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
416 ; GFX8-NEXT: s_add_u32 s8, s8, s3
417 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
418 ; GFX8-NEXT: s_waitcnt vmcnt(2)
419 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
420 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
421 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
422 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
423 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
424 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
425 ; GFX8-NEXT: s_waitcnt vmcnt(1)
426 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
427 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2
428 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
429 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
430 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
431 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
432 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
433 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
434 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
435 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
436 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16
437 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
438 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
439 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
440 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
441 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
442 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
443 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
444 ; GFX8-NEXT: s_waitcnt vmcnt(0)
445 ; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4
446 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
447 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
448 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
449 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
450 ; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4
451 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
452 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
453 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
454 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18
455 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
456 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
457 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
458 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
459 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
460 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
461 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4
462 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
463 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
464 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
465 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
466 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
467 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
468 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
469 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
470 ; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
471 ; GFX8-NEXT: flat_store_short v[0:1], v2
472 ; GFX8-NEXT: s_endpgm
474 ; GFX9-LABEL: idot8_acc16:
475 ; GFX9: ; %bb.0: ; %entry
476 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
477 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
478 ; GFX9-NEXT: s_mov_b32 s10, -1
479 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
480 ; GFX9-NEXT: s_add_u32 s8, s8, s3
481 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
482 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
483 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
484 ; GFX9-NEXT: v_mov_b32_e32 v4, 12
485 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
486 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
487 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
488 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
489 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
490 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
491 ; GFX9-NEXT: s_waitcnt vmcnt(2)
492 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
493 ; GFX9-NEXT: s_waitcnt vmcnt(1)
494 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
495 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
496 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2
497 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
498 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
499 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
500 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
501 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
502 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
503 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
504 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
505 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
506 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
507 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
508 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
509 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15
510 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
511 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9
512 ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14
513 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
514 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
515 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
516 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
517 ; GFX9-NEXT: s_waitcnt vmcnt(0)
518 ; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
519 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
520 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
521 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
522 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
523 ; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
524 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
525 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
526 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
527 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
528 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
529 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
530 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
531 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
532 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
533 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
534 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
535 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
536 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
537 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5
538 ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10
539 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
540 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
541 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
542 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
543 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
544 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
545 ; GFX9-NEXT: s_endpgm
547 ; GFX9-DL-LABEL: idot8_acc16:
548 ; GFX9-DL: ; %bb.0: ; %entry
549 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
550 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
551 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
552 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
553 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
554 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
555 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
556 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
557 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
558 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
559 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
560 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
561 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
562 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
563 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
564 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
565 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
566 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
567 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
568 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
569 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
570 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
571 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
572 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
573 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
574 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
575 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
576 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
577 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
578 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
579 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
580 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
581 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
582 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
583 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
584 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
585 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
586 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
587 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
588 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
589 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
590 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
591 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
592 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
593 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
594 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
595 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
596 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
597 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
598 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
599 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
600 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
601 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
602 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
603 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
604 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
605 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
606 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
607 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
608 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
609 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
610 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
611 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
612 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
613 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
614 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
615 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
616 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
617 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
618 ; GFX9-DL-NEXT: s_endpgm
620 ; GFX10-DL-XNACK-LABEL: idot8_acc16:
621 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
622 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
623 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
624 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
625 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
626 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
627 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
628 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
629 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
630 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
631 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
632 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
633 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
634 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
635 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
636 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
637 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
638 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
639 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
640 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
641 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
642 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
643 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
644 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
645 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
646 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
647 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
648 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
649 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
650 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
651 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
652 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
653 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
654 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
655 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
656 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
657 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
658 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
659 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
660 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
661 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
662 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
663 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
664 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
665 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
666 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
667 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
668 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
669 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
670 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
671 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
672 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
673 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
674 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
675 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
676 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
677 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
678 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
679 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
680 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
681 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
682 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
683 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
684 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
685 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
686 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
687 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
688 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
689 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
690 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
691 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
692 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
693 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
694 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
695 ; GFX10-DL-XNACK-NEXT: s_endpgm
697 ; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
698 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
699 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
700 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
701 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
702 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
703 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
704 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
705 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
706 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
707 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
708 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
709 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
710 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
711 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
712 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
713 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
714 ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
715 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
716 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
717 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
718 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
719 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
720 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
721 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
722 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
723 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
724 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
725 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
726 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
727 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
728 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
729 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
730 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
731 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
732 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
733 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
734 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
735 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
736 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
737 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
738 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
739 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
740 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
741 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
742 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
743 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
744 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
745 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
746 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
747 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
748 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
749 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
750 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
751 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
752 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
753 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
754 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
755 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
756 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
757 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
758 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
759 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
760 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
761 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
762 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
763 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
764 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
765 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
766 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
767 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
768 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
769 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
770 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
771 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
772 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
773 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
774 ; GFX10-DL-LABEL: idot8_acc16:
775 ; GFX10-DL: ; %bb.0: ; %entry
776 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
777 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
778 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
779 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
780 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
781 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
782 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
783 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
784 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
785 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
787 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
788 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
789 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
790 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
791 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
792 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
793 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
794 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
795 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
796 ; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
797 ; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
798 ; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
799 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
800 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
801 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
802 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
803 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
804 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
805 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
806 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
807 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
808 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
809 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
810 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
811 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
812 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
813 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
814 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
815 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
816 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
817 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
818 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
819 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
820 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
821 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
822 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
823 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
824 ; GFX10-DL-NEXT: s_endpgm
825 <8 x i4> addrspace(1)* %src2,
826 i16 addrspace(1)* nocapture %dst) {
828 %idx = call i32 @llvm.amdgcn.workitem.id.x()
829 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
830 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
831 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
832 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
834 %v1e0 = extractelement <8 x i4> %vec1, i64 0
835 %cv1e0 = sext i4 %v1e0 to i16
836 %v2e0 = extractelement <8 x i4> %vec2, i64 0
837 %cv2e0 = sext i4 %v2e0 to i16
838 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
840 %v1e1 = extractelement <8 x i4> %vec1, i64 1
841 %cv1e1 = sext i4 %v1e1 to i16
842 %v2e1 = extractelement <8 x i4> %vec2, i64 1
843 %cv2e1 = sext i4 %v2e1 to i16
844 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
846 %v1e2 = extractelement <8 x i4> %vec1, i64 2
847 %cv1e2 = sext i4 %v1e2 to i16
848 %v2e2 = extractelement <8 x i4> %vec2, i64 2
849 %cv2e2 = sext i4 %v2e2 to i16
850 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
852 %v1e3 = extractelement <8 x i4> %vec1, i64 3
853 %cv1e3 = sext i4 %v1e3 to i16
854 %v2e3 = extractelement <8 x i4> %vec2, i64 3
855 %cv2e3 = sext i4 %v2e3 to i16
856 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
858 %v1e4 = extractelement <8 x i4> %vec1, i64 4
859 %cv1e4 = sext i4 %v1e4 to i16
860 %v2e4 = extractelement <8 x i4> %vec2, i64 4
861 %cv2e4 = sext i4 %v2e4 to i16
862 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
864 %v1e5 = extractelement <8 x i4> %vec1, i64 5
865 %cv1e5 = sext i4 %v1e5 to i16
866 %v2e5 = extractelement <8 x i4> %vec2, i64 5
867 %cv2e5 = sext i4 %v2e5 to i16
868 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
870 %v1e6 = extractelement <8 x i4> %vec1, i64 6
871 %cv1e6 = sext i4 %v1e6 to i16
872 %v2e6 = extractelement <8 x i4> %vec2, i64 6
873 %cv2e6 = sext i4 %v2e6 to i16
874 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
876 %v1e7 = extractelement <8 x i4> %vec1, i64 7
877 %cv1e7 = sext i4 %v1e7 to i16
878 %v2e7 = extractelement <8 x i4> %vec2, i64 7
879 %cv2e7 = sext i4 %v2e7 to i16
880 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
882 %acc = load i16, i16 addrspace(1)* %dst, align 4
883 %add1 = add i16 %mul0, %acc
884 %add2 = add i16 %add1, %mul1
885 %add3 = add i16 %add2, %mul2
886 %add4 = add i16 %add3, %mul3
887 %add5 = add i16 %add4, %mul4
888 %add6 = add i16 %add5, %mul5
889 %add7 = add i16 %add6, %mul6
890 %add8 = add i16 %add7, %mul7
892 store i16 %add8, i16 addrspace(1)* %dst, align 4
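896 ; TODO: Support this pattern (idot8 with a byte-sized accumulator); the checks below show it is still expanded into per-element extracts and multiply-adds.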
897 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
898 ; GFX7-LABEL: idot8_acc8:
899 ; GFX7: ; %bb.0: ; %entry
900 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
901 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
902 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
903 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
904 ; GFX7-NEXT: s_mov_b32 s14, -1
905 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
906 ; GFX7-NEXT: s_add_u32 s12, s12, s3
907 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
908 ; GFX7-NEXT: s_mov_b32 s10, 0
909 ; GFX7-NEXT: s_mov_b32 s11, s3
910 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
912 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
913 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
914 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
915 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
916 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
917 ; GFX7-NEXT: s_mov_b32 s2, -1
918 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
919 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
920 ; GFX7-NEXT: s_waitcnt vmcnt(2)
921 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
922 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
923 ; GFX7-NEXT: s_waitcnt vmcnt(1)
924 ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
925 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
926 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
927 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
928 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
929 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
930 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
931 ; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
932 ; GFX7-NEXT: s_waitcnt vmcnt(0)
933 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
934 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
935 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
936 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
937 ; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
938 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
939 ; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
940 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
941 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
942 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
943 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
944 ; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
945 ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
946 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
947 ; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
948 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
949 ; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
950 ; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
951 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
952 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
953 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
954 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
955 ; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
956 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
957 ; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
958 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
959 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
960 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
961 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
962 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
963 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
964 ; GFX7-NEXT: s_endpgm
966 ; GFX8-LABEL: idot8_acc8:
967 ; GFX8: ; %bb.0: ; %entry
968 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
969 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
970 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
971 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
972 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
973 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
974 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
975 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
976 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
977 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
978 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
979 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
980 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
981 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
982 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
983 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
984 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
985 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
986 ; GFX8-NEXT: s_mov_b32 s10, -1
987 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
988 ; GFX8-NEXT: s_add_u32 s8, s8, s3
989 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
990 ; GFX8-NEXT: s_waitcnt vmcnt(2)
991 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
992 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
993 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
994 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
995 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
996 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
997 ; GFX8-NEXT: s_waitcnt vmcnt(1)
998 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
999 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2
1000 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1001 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
1002 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
1003 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
1004 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1005 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1006 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1007 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1008 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16
1009 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1010 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1011 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
1012 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1013 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1014 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1015 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
1016 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1017 ; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4
1018 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1019 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1020 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1021 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1022 ; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4
1023 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1024 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1025 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
1026 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1027 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
1028 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1029 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1030 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
1031 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1032 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1033 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4
1034 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1035 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1036 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1037 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1038 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
1039 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1040 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1041 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1042 ; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
1043 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1044 ; GFX8-NEXT: s_endpgm
1046 ; GFX9-LABEL: idot8_acc8:
1047 ; GFX9: ; %bb.0: ; %entry
1048 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1049 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1050 ; GFX9-NEXT: s_mov_b32 s10, -1
1051 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1052 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1053 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1054 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1055 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1056 ; GFX9-NEXT: v_mov_b32_e32 v4, 12
1057 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1058 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1059 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1060 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1061 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
1062 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1063 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1064 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
1065 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1066 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
1067 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
1068 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2
1069 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
1070 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1071 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
1072 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
1073 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1074 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
1075 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
1076 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
1077 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1078 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1079 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1080 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1081 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15
1082 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
1083 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1084 ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1085 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1086 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1087 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1088 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1089 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1090 ; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
1091 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1092 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1093 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1094 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1095 ; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
1096 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1097 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1098 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
1099 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1100 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1101 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1102 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1103 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
1104 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1105 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1106 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
1107 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
1108 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1109 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5
1110 ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1111 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
1112 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1113 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1114 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1115 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
1116 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
1117 ; GFX9-NEXT: s_endpgm
1119 ; GFX9-DL-LABEL: idot8_acc8:
1120 ; GFX9-DL: ; %bb.0: ; %entry
1121 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1122 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1123 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1124 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1125 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1126 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1127 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1128 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1129 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
1130 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1131 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1132 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1133 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1134 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
1135 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1136 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1137 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
1138 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1139 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
1140 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
1141 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
1142 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
1143 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1144 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
1145 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
1146 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1147 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
1148 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
1149 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
1150 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1151 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1152 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1153 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1154 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
1155 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
1156 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1157 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1158 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1159 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1160 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1161 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1162 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1163 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
1164 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1165 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1166 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1167 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1168 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
1169 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1170 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1171 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
1172 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1173 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1174 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1175 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1176 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
1177 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1178 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1179 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
1180 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
1181 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1182 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
1183 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1184 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
1185 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1186 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1187 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1188 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
1189 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
1190 ; GFX9-DL-NEXT: s_endpgm
1192 ; GFX10-DL-XNACK-LABEL: idot8_acc8:
1193 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
1194 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1195 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1196 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1197 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1198 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1199 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
1200 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
1201 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
1202 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
1203 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1204 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
1205 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
1206 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
1207 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
1208 ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1]
1209 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
1210 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1211 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1212 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1213 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1214 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
1215 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
1216 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
1217 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
1218 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
1219 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
1220 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
1221 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1222 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1223 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
1224 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1225 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
1226 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
1227 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
1228 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
1229 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
1230 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
1231 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
1232 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
1233 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
1234 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
1235 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
1236 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
1237 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
1238 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
1239 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
1240 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
1241 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
1242 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
1243 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
1244 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
1245 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
1246 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
1247 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
1248 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
1249 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
1250 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
1251 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
1252 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
1253 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
1254 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
1255 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
1256 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
1257 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
1258 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
1259 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
1260 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
1261 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
1262 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
1263 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
1264 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
1265 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
1266 ; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
1267 ; GFX10-DL-XNACK-NEXT: s_endpgm
1269 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
1270 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
1271 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1272 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1273 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1274 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1275 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
1276 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1277 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1278 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
1279 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
1280 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
1281 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
1282 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1283 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1284 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
1285 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
1286 ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
1287 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
1288 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1289 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1290 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1291 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1292 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
1293 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
1294 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
1295 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
1296 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
1297 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
1298 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
1299 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
1300 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
1301 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
1302 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
1303 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
1304 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
1305 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
1306 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
1307 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
1308 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
1309 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
1310 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
1311 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
1312 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
1313 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
1314 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
1315 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
1316 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
1317 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
1318 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
1319 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
1320 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
1321 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
1322 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
1323 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
1324 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
1325 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
1326 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
1327 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
1328 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
1329 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
1330 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
1331 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
1332 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
1333 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
1334 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
1335 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
1336 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
1337 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
1338 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
1339 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
1340 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
1341 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
1342 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
1343 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
1344 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
1345 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
1346 ; GFX10-DL-LABEL: idot8_acc8:
1347 ; GFX10-DL: ; %bb.0: ; %entry
1348 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1349 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1350 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1351 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1352 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
1353 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
1354 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
1355 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1356 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
1357 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1358 ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
1359 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
1360 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
1361 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1362 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
1363 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
1364 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
1365 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
1366 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
1367 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
1368 ; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
1369 ; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
1370 ; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
1371 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
1372 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
1373 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
1374 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
1375 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
1376 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1377 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
1378 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
1379 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
1380 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
1381 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
1382 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1383 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
1384 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
1385 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1386 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
1387 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
1388 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1389 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
1390 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
1391 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
1392 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
1393 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1394 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
1395 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
1396 ; GFX10-DL-NEXT: s_endpgm
1397 <8 x i4> addrspace(1)* %src2,
1398 i8 addrspace(1)* nocapture %dst) {
1400 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1401 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1402 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1403 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1404 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1406 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1407 %cv1e0 = sext i4 %v1e0 to i8
1408 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1409 %cv2e0 = sext i4 %v2e0 to i8
1410 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
1412 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1413 %cv1e1 = sext i4 %v1e1 to i8
1414 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1415 %cv2e1 = sext i4 %v2e1 to i8
1416 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
1418 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1419 %cv1e2 = sext i4 %v1e2 to i8
1420 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1421 %cv2e2 = sext i4 %v2e2 to i8
1422 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
1424 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1425 %cv1e3 = sext i4 %v1e3 to i8
1426 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1427 %cv2e3 = sext i4 %v2e3 to i8
1428 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
1430 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1431 %cv1e4 = sext i4 %v1e4 to i8
1432 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1433 %cv2e4 = sext i4 %v2e4 to i8
1434 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
1436 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1437 %cv1e5 = sext i4 %v1e5 to i8
1438 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1439 %cv2e5 = sext i4 %v2e5 to i8
1440 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
1442 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1443 %cv1e6 = sext i4 %v1e6 to i8
1444 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1445 %cv2e6 = sext i4 %v2e6 to i8
1446 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
1448 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1449 %cv1e7 = sext i4 %v1e7 to i8
1450 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1451 %cv2e7 = sext i4 %v2e7 to i8
1452 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
1454 %acc = load i8, i8 addrspace(1)* %dst, align 4
1455 %add1 = add i8 %mul0, %acc
1456 %add2 = add i8 %add1, %mul1
1457 %add3 = add i8 %add2, %mul2
1458 %add4 = add i8 %add3, %mul3
1459 %add5 = add i8 %add4, %mul4
1460 %add6 = add i8 %add5, %mul5
1461 %add7 = add i8 %add6, %mul6
1462 %add8 = add i8 %add7, %mul7
1464 store i8 %add8, i8 addrspace(1)* %dst, align 4
1468 ; Make sure the pattern is not recognized if there are multiple uses of the
1469 ; intermediate multiplications.
; Here %mul0 is consumed by both %add and %add1, and %add is consumed by both
; %add1 and %res, so the accumulation is not a single-use multiply/add chain.
; None of the expected outputs below contains a v_dot8 instruction; every
; target expands the computation into mul/mad/add sequences.
1470 define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1471 ; GFX7-LABEL: idot8_multiuses_mul1:
1472 ; GFX7: ; %bb.0: ; %entry
1473 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1474 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1475 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1476 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1477 ; GFX7-NEXT: s_mov_b32 s14, -1
1478 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1479 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1480 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1481 ; GFX7-NEXT: s_mov_b32 s10, 0
1482 ; GFX7-NEXT: s_mov_b32 s11, s3
1483 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1484 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1485 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1486 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1487 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1488 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1489 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1490 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1491 ; GFX7-NEXT: s_mov_b32 s2, -1
1492 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1493 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1494 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
1495 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
1496 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1497 ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
1498 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1499 ; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
1500 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
1501 ; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
1502 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
1503 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
1504 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
1505 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
1506 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
1507 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
1508 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
1509 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
1510 ; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
1511 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
1512 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
1513 ; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
1514 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
1515 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
1516 ; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
1517 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1518 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1519 ; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
1520 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
1521 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0
1522 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1523 ; GFX7-NEXT: s_endpgm
1525 ; GFX8-LABEL: idot8_multiuses_mul1:
1526 ; GFX8: ; %bb.0: ; %entry
1527 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1528 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1529 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1530 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1531 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1533 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1534 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1535 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1536 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1537 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1538 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1539 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1540 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1541 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1542 ; GFX8-NEXT: s_mov_b32 s10, -1
1543 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1544 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1545 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1546 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1547 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
1548 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
1549 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
1550 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
1551 ; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
1552 ; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
1553 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1554 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
1555 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1556 ; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
1557 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
1558 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
1559 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
1560 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
1561 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
1562 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
1563 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
1564 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
1565 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
1566 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
1567 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
1568 ; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
1569 ; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
1570 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
1571 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1572 ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
1573 ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
1574 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
1575 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1576 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1577 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1578 ; GFX8-NEXT: s_endpgm
1580 ; GFX9-LABEL: idot8_multiuses_mul1:
1581 ; GFX9: ; %bb.0: ; %entry
1582 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1583 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1584 ; GFX9-NEXT: s_mov_b32 s10, -1
1585 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1586 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1587 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1588 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1589 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1590 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1591 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1592 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1593 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1594 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1595 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1596 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1597 ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
1598 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1599 ; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
1600 ; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
1601 ; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
1602 ; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
1603 ; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
1604 ; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
1605 ; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
1606 ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
1607 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
1608 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
1609 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
1610 ; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
1611 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
1612 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1613 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1614 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1615 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1616 ; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0
1617 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6
1618 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8
1619 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2
1620 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v9, v10
1621 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v11, v12
1622 ; GFX9-NEXT: v_add3_u32 v3, v3, v5, v6
1623 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v13, v14
1624 ; GFX9-NEXT: v_mul_i32_i24_e32 v10, v15, v16
1625 ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8
1626 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10
1627 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2
1628 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1629 ; GFX9-NEXT: s_endpgm
1631 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
1632 ; GFX9-DL: ; %bb.0: ; %entry
1633 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1634 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1635 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1636 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1637 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1638 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1639 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1640 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1641 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1642 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1643 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1644 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1645 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1646 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1647 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1648 ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4
1649 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1650 ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 4
1651 ; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 4, 4
1652 ; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 4, 4
1653 ; GFX9-DL-NEXT: v_bfe_i32 v7, v1, 8, 4
1654 ; GFX9-DL-NEXT: v_bfe_i32 v8, v2, 8, 4
1655 ; GFX9-DL-NEXT: v_bfe_i32 v9, v1, 12, 4
1656 ; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 12, 4
1657 ; GFX9-DL-NEXT: v_bfe_i32 v11, v1, 16, 4
1658 ; GFX9-DL-NEXT: v_bfe_i32 v12, v2, 16, 4
1659 ; GFX9-DL-NEXT: v_bfe_i32 v13, v1, 20, 4
1660 ; GFX9-DL-NEXT: v_bfe_i32 v14, v2, 20, 4
1661 ; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4
1662 ; GFX9-DL-NEXT: v_bfe_i32 v16, v2, 24, 4
1663 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1664 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1665 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1666 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1667 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0
1668 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6
1669 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8
1670 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2
1671 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v9, v10
1672 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v11, v12
1673 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v5, v6
1674 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v13, v14
1675 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v10, v15, v16
1676 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8
1677 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10
1678 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2
1679 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1680 ; GFX9-DL-NEXT: s_endpgm
1682 ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
1683 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
1684 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1685 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1686 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1687 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1688 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1689 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
1690 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
1691 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
1692 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
1693 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1694 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
1695 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
1696 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
1697 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
1698 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
1699 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
1700 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
1701 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
1702 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 4, 4
1703 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
1704 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v2, 8, 4
1705 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v2, 0, 4
1706 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
1707 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
1708 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
1709 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
1710 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1711 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
1712 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
1713 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
1714 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
1715 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4
1716 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5
1717 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
1718 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4
1719 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
1720 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
1721 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
1722 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
1723 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
1724 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1725 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1726 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6
1727 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1728 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
1729 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
1730 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
1731 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
1732 ; GFX10-DL-XNACK-NEXT: s_endpgm
1734 ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
1735 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
1736 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1737 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1738 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1739 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1740 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1741 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
1742 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
1743 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
1744 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
1745 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1747 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
1748 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
1749 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
1750 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
1751 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
1752 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
1753 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
1754 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v0, 4, 4
1755 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
1756 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v0, 8, 4
1757 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 0, 4
1758 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
1759 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
1760 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
1761 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
1762 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1763 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
1764 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
1765 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
1766 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
1767 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4
1768 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5
1769 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
1770 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4
1771 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
1772 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
1773 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4
1774 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
1775 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
1776 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1777 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1778 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6
1779 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0
1780 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
1781 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
1782 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
1783 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
1784 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
1831 <8 x i4> addrspace(1)* %src2,
1832 i32 addrspace(1)* nocapture %dst) {
; Each work-item loads one <8 x i4> vector from each source buffer, indexed by
; its workitem id.
1834 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1835 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1836 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1837 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1838 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
; Sign-extend every i4 lane to i32 and multiply the matching lanes.
1840 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1841 %cv1e0 = sext i4 %v1e0 to i32
1842 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1843 %cv2e0 = sext i4 %v2e0 to i32
1844 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1846 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1847 %cv1e1 = sext i4 %v1e1 to i32
1848 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1849 %cv2e1 = sext i4 %v2e1 to i32
1850 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1852 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1853 %cv1e2 = sext i4 %v1e2 to i32
1854 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1855 %cv2e2 = sext i4 %v2e2 to i32
1856 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1858 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1859 %cv1e3 = sext i4 %v1e3 to i32
1860 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1861 %cv2e3 = sext i4 %v2e3 to i32
1862 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1864 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1865 %cv1e4 = sext i4 %v1e4 to i32
1866 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1867 %cv2e4 = sext i4 %v2e4 to i32
1868 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1870 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1871 %cv1e5 = sext i4 %v1e5 to i32
1872 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1873 %cv2e5 = sext i4 %v2e5 to i32
1874 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1876 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1877 %cv1e6 = sext i4 %v1e6 to i32
1878 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1879 %cv2e6 = sext i4 %v2e6 to i32
1880 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1882 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1883 %cv1e7 = sext i4 %v1e7 to i32
1884 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1885 %cv2e7 = sext i4 %v2e7 to i32
1886 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
; Accumulate onto the value loaded from %dst. %mul0 is added twice (%add and
; %add1) and %add is reused in %res; these extra uses are the point of the test.
1888 %acc = load i32, i32 addrspace(1)* %dst, align 4
1889 %add = add i32 %mul0, %acc
1890 %add1 = add i32 %mul0, %add
1891 %add2 = add i32 %add1, %mul1
1892 %add3 = add i32 %add2, %mul2
1893 %add4 = add i32 %add3, %mul3
1894 %add5 = add i32 %add4, %mul4
1895 %add6 = add i32 %add5, %mul5
1896 %add7 = add i32 %add6, %mul6
1897 %add8 = add i32 %add7, %mul7
1899 %res = add i32 %add, %add8
1900 store i32 %res, i32 addrspace(1)* %dst, align 4
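1904 ; TODO: Support this pattern.
; NOTE(review): the gfx906 and gfx1011/gfx1012 outputs below already select
; v_dot8_i32_i4 for this function, so the TODO above may be stale; confirm
; before removing it.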
1905 define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1906 ; GFX7-LABEL: idot8_acc32_vecMul:
1907 ; GFX7: ; %bb.0: ; %entry
1908 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1909 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1910 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1911 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1912 ; GFX7-NEXT: s_mov_b32 s14, -1
1913 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1914 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1915 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1916 ; GFX7-NEXT: s_mov_b32 s10, 0
1917 ; GFX7-NEXT: s_mov_b32 s11, s3
1918 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1919 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1920 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1921 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1922 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1923 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1924 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1925 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1926 ; GFX7-NEXT: s_mov_b32 s2, -1
1927 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1928 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1929 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2
1930 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
1931 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
1932 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
1933 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
1934 ; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
1935 ; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4
1936 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
1937 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1938 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0
1939 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
1940 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
1941 ; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
1942 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
1943 ; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
1944 ; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
1945 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
1946 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1947 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
1948 ; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
1949 ; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
1950 ; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
1951 ; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0
1952 ; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0
1953 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0
1954 ; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0
1955 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1956 ; GFX7-NEXT: s_endpgm
1958 ; GFX8-LABEL: idot8_acc32_vecMul:
1959 ; GFX8: ; %bb.0: ; %entry
1960 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1961 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1962 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1963 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1964 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1965 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1966 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1967 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1968 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1969 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1970 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1971 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1972 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1973 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1974 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1975 ; GFX8-NEXT: s_mov_b32 s10, -1
1976 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1977 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1978 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1979 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1980 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3
1981 ; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4
1982 ; GFX8-NEXT: v_bfe_i32 v4, v3, 20, 4
1983 ; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 4
1984 ; GFX8-NEXT: v_bfe_i32 v6, v3, 12, 4
1985 ; GFX8-NEXT: v_bfe_i32 v7, v3, 8, 4
1986 ; GFX8-NEXT: v_bfe_i32 v8, v3, 4, 4
1987 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 4
1988 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1989 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v0
1990 ; GFX8-NEXT: v_bfe_i32 v10, v0, 24, 4
1991 ; GFX8-NEXT: v_bfe_i32 v11, v0, 20, 4
1992 ; GFX8-NEXT: v_bfe_i32 v12, v0, 16, 4
1993 ; GFX8-NEXT: v_bfe_i32 v13, v0, 12, 4
1994 ; GFX8-NEXT: v_bfe_i32 v14, v0, 8, 4
1995 ; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4
1996 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
1997 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1998 ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
1999 ; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
2000 ; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
2001 ; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
2002 ; GFX8-NEXT: v_mad_i32_i24 v0, v5, v12, v0
2003 ; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
2004 ; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0
2005 ; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0
2006 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2007 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2008 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2009 ; GFX8-NEXT: s_endpgm
2011 ; GFX9-LABEL: idot8_acc32_vecMul:
2012 ; GFX9: ; %bb.0: ; %entry
2013 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2014 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2015 ; GFX9-NEXT: s_mov_b32 s10, -1
2016 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2017 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2018 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2019 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2020 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2021 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2022 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2023 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2024 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2025 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
2026 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2027 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2028 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1
2029 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4
2030 ; GFX9-NEXT: v_bfe_i32 v5, v1, 20, 4
2031 ; GFX9-NEXT: v_bfe_i32 v6, v1, 16, 4
2032 ; GFX9-NEXT: v_bfe_i32 v7, v1, 12, 4
2033 ; GFX9-NEXT: v_bfe_i32 v8, v1, 8, 4
2034 ; GFX9-NEXT: v_bfe_i32 v9, v1, 4, 4
2035 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4
2036 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2037 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2
2038 ; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4
2039 ; GFX9-NEXT: v_bfe_i32 v12, v2, 20, 4
2040 ; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4
2041 ; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4
2042 ; GFX9-NEXT: v_bfe_i32 v15, v2, 8, 4
2043 ; GFX9-NEXT: v_bfe_i32 v16, v2, 4, 4
2044 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 4
2045 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
2046 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, v9, v16
2047 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15
2048 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14
2049 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2050 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
2051 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13
2052 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12
2053 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
2054 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, v4, v11
2055 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10
2056 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
2057 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
2058 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
2059 ; GFX9-NEXT: s_endpgm
2061 ; GFX9-DL-LABEL: idot8_acc32_vecMul:
2062 ; GFX9-DL: ; %bb.0: ; %entry
2063 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2064 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2065 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2066 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2067 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2068 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2069 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2070 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2071 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2072 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2073 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2074 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2075 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
2076 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2077 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2078 ; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
2079 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
2080 ; GFX9-DL-NEXT: s_endpgm
2082 ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
2083 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
2084 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2085 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2086 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2087 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2088 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2089 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
2090 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
2091 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
2092 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
2093 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
2094 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
2095 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
2096 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
2097 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
2098 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
2099 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2100 ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
2101 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
2102 ; GFX10-DL-XNACK-NEXT: s_endpgm
2104 ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
2105 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
2106 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2107 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2108 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2109 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2110 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
2111 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2112 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
2113 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
2114 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
2115 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
2116 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
2117 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
2118 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
2119 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
2120 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
2121 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2122 ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
2123 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
2124 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
2125 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
2126 ; GFX10-DL: ; %bb.0: ; %entry
2127 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2128 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2129 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2130 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2131 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2132 ; GFX10-DL-NEXT: s_clause 0x1
2133 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
2134 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2135 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
2136 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2137 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2138 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
2139 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
2140 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
2141 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2142 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
2143 ; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
2144 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
2145 ; GFX10-DL-NEXT: s_endpgm
2146 <8 x i4> addrspace(1)* %src2,
2147 i32 addrspace(1)* nocapture %dst) {
2149 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2150 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2151 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2152 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2153 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2155 %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
2156 %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
2158 %mul = mul <8 x i32> %cvec1, %cvec2
2159 %mul0 = extractelement <8 x i32> %mul, i64 0
2160 %mul1 = extractelement <8 x i32> %mul, i64 1
2161 %mul2 = extractelement <8 x i32> %mul, i64 2
2162 %mul3 = extractelement <8 x i32> %mul, i64 3
2163 %mul4 = extractelement <8 x i32> %mul, i64 4
2164 %mul5 = extractelement <8 x i32> %mul, i64 5
2165 %mul6 = extractelement <8 x i32> %mul, i64 6
2166 %mul7 = extractelement <8 x i32> %mul, i64 7
2168 %acc = load i32, i32 addrspace(1)* %dst, align 4
2169 %add1 = add i32 %mul0, %acc
2170 %add2 = add i32 %add1, %mul1
2171 %add3 = add i32 %add2, %mul2
2172 %add4 = add i32 %add3, %mul3
2173 %add5 = add i32 %add4, %mul4
2174 %add6 = add i32 %add5, %mul5
2175 %add7 = add i32 %add6, %mul6
2176 %add8 = add i32 %add7, %mul7
2178 store i32 %add8, i32 addrspace(1)* %dst, align 4
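2182 ; TODO: Support this pattern (idot8_acc16_vecMul). With a 16-bit accumulator the dot-product capable targets still emit v_pk_mul_lo_u16 plus 16-bit add chains instead of a single v_dot8_i32_i4, unlike idot8_acc32_vecMul above.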
2183 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2184 ; GFX7-LABEL: idot8_acc16_vecMul:
2185 ; GFX7: ; %bb.0: ; %entry
2186 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2187 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2188 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2189 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2190 ; GFX7-NEXT: s_mov_b32 s14, -1
2191 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2192 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2193 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2194 ; GFX7-NEXT: s_mov_b32 s10, 0
2195 ; GFX7-NEXT: s_mov_b32 s11, s3
2196 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2197 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2198 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2199 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2200 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2201 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2202 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2203 ; GFX7-NEXT: s_mov_b32 s2, -1
2204 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
2205 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2206 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2207 ; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4
2208 ; GFX7-NEXT: v_bfe_i32 v6, v2, 4, 4
2209 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2210 ; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4
2211 ; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4
2212 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
2213 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
2214 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
2215 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
2216 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
2217 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
2218 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2219 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
2220 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
2221 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
2222 ; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
2223 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2
2224 ; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4
2225 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
2226 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
2227 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
2228 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
2229 ; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
2230 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4
2231 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
2232 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
2233 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
2234 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2235 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
2236 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
2237 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
2238 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2239 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
2240 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
2241 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
2242 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
2243 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
2244 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2245 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
2246 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
2247 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2248 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
2249 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2250 ; GFX7-NEXT: s_endpgm
2252 ; GFX8-LABEL: idot8_acc16_vecMul:
2253 ; GFX8: ; %bb.0: ; %entry
2254 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2255 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2256 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2257 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
2258 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2259 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2260 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2261 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2262 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2263 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2264 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2265 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2266 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2267 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2268 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2269 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2270 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
2271 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2272 ; GFX8-NEXT: s_mov_b32 s10, -1
2273 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2274 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2275 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2276 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2277 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
2278 ; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2279 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3
2280 ; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3
2282 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3
2283 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3
2284 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3
2285 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2286 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2
2287 ; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2288 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2
2289 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2290 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
2291 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
2292 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
2293 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2
2294 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
2295 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2296 ; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18
2297 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2298 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
2299 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17
2300 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2301 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
2302 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2303 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2304 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
2305 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16
2306 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2307 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2308 ; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2
2309 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2310 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2311 ; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2
2312 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
2313 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2314 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
2315 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2316 ; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2
2317 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2318 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2319 ; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2
2320 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
2321 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2322 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
2323 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2324 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
2325 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2326 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2327 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
2328 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
2329 ; GFX8-NEXT: flat_store_short v[0:1], v2
2330 ; GFX8-NEXT: s_endpgm
2332 ; GFX9-LABEL: idot8_acc16_vecMul:
2333 ; GFX9: ; %bb.0: ; %entry
2334 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2335 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2336 ; GFX9-NEXT: s_mov_b32 s10, -1
2337 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2338 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2339 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2340 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2341 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2342 ; GFX9-NEXT: v_mov_b32_e32 v4, 12
2343 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2344 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2345 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2346 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2347 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
2348 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2349 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2350 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1
2351 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1
2352 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
2353 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
2354 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1
2355 ; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2356 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1
2357 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2358 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2
2359 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2360 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2
2361 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2
2362 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2
2363 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2
2364 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2365 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2
2366 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2367 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5
2368 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6
2369 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7
2370 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v8
2371 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9
2372 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10
2373 ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11
2374 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12
2375 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13
2376 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2377 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2378 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2379 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2380 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14
2381 ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15
2382 ; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2383 ; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12
2384 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2385 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2386 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2387 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2388 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
2389 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16
2390 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
2391 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18
2392 ; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9
2393 ; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14
2394 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2395 ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
2396 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
2397 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2398 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2399 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2400 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2401 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
2402 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2403 ; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2
2404 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2405 ; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16
2406 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
2407 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2408 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
2409 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2410 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
2411 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2412 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
2413 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2414 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
2415 ; GFX9-NEXT: s_endpgm
2417 ; GFX9-DL-LABEL: idot8_acc16_vecMul:
2418 ; GFX9-DL: ; %bb.0: ; %entry
2419 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2420 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2421 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2422 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2423 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2424 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2425 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2426 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2427 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
2428 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2429 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
2430 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
2431 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2432 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
2433 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2434 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2435 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1
2436 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1
2437 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
2438 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
2439 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1
2440 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2441 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1
2442 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2443 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2
2444 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2445 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2
2446 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2
2447 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2
2448 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2
2449 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2450 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2
2451 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2452 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5
2453 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6
2454 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7
2455 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v8
2456 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9
2457 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10
2458 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11
2459 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12
2460 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13
2461 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
2462 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2463 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2464 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2465 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14
2466 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15
2467 ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2468 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12
2469 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2470 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2471 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2472 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2473 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
2474 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16
2475 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
2476 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18
2477 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
2478 ; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14
2479 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2480 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
2481 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
2482 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2483 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2484 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2485 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2486 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
2487 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2488 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2
2489 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2490 ; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16
2491 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
2492 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2493 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
2494 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2495 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
2496 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2497 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
2498 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2499 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
2500 ; GFX9-DL-NEXT: s_endpgm
2502 ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
2503 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
2504 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2505 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2506 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2507 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2508 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2509 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
2510 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
2511 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
2512 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
2513 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
2514 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
2515 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
2516 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
2517 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
2518 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
2519 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
2520 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1
2521 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1
2522 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
2523 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2
2524 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2
2525 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1
2526 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2
2527 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
2528 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
2529 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
2530 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
2531 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1
2532 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2
2533 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
2534 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
2535 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
2536 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
2537 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12
2538 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5
2539 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2540 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2
2541 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
2542 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
2543 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
2544 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
2545 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12
2546 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2547 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
2548 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2
2549 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
2550 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
2551 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
2552 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13
2553 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14
2554 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7
2555 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
2556 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1
2557 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
2558 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2
2559 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2
2560 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
2561 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
2562 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
2563 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16
2564 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12
2565 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2566 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2567 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
2568 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3
2569 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
2570 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
2571 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15
2572 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
2573 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13
2574 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9
2575 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5
2576 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
2577 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
2578 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
2579 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
2580 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
2581 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4
2582 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9
2583 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5
2584 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5
2585 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
2586 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17
2587 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2
2588 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1
2589 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4
2590 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
2591 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
2592 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2593 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2594 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4
2595 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2
2596 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5
2597 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2598 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
2599 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
2600 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
2601 ; GFX10-DL-XNACK-NEXT: s_endpgm
2603 ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
2604 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
2605 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2606 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2607 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2608 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
2609 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2610 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2611 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
2612 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
2613 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
2614 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
2615 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
2616 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
2617 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
2618 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
2619 ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
2620 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
2621 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1
2622 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1
2623 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
2624 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0
2625 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0
2626 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1
2627 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0
2628 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
2629 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
2630 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
2631 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
2632 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1
2633 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0
2634 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
2635 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
2636 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
2637 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
2638 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12
2639 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5
2640 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2641 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0
2642 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
2643 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
2644 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
2645 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
2646 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12
2647 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5
2648 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
2649 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0
2650 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
2651 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
2652 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
2653 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13
2654 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14
2655 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7
2656 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
2657 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1
2658 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
2659 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0
2660 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0
2661 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
2662 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
2663 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
2664 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16
2665 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12
2666 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
2667 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2668 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
2669 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3
2670 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
2671 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
2672 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15
2673 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
2674 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13
2675 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9
2676 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5
2677 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
2678 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
2679 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
2680 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
2681 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
2682 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4
2683 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9
2684 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5
2685 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5
2686 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
2687 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17
2688 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0
2689 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1
2690 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4
2691 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
2692 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0
2693 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1
2694 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2695 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4
2696 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0
2697 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5
2698 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2699 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
2700 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
2701 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
2702 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
2703 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
2704 ; GFX10-DL: ; %bb.0: ; %entry
2705 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
2706 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2707 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2708 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2709 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
2710 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
2711 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
2712 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2713 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
2714 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2715 ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
2716 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
2717 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
2718 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2719 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
2720 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28
2721 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
2722 ; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014
2723 ; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008
2724 ; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c
2725 ; GFX10-DL-NEXT: s_and_b32 s10, s0, 15
2726 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004
2727 ; GFX10-DL-NEXT: s_and_b32 s11, s1, 15
2728 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
2729 ; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004
2730 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
2731 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10
2732 ; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c
2733 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
2734 ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008
2735 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2736 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9
2737 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11
2738 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2739 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
2740 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
2741 ; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010
2742 ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014
2743 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
2744 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
2745 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
2746 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7
2747 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0
2748 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
2749 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2750 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2751 ; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
2752 ; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28
2753 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3
2754 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2755 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
2756 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2757 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2758 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
2759 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2760 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
2761 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
2762 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2763 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
2764 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2765 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2766 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
2767 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
2768 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2769 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2770 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3
2771 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2772 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
2773 ; GFX10-DL-NEXT: s_endpgm
2774 <8 x i4> addrspace(1)* %src2,
2775 i16 addrspace(1)* nocapture %dst) {
2777 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2778 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2779 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2780 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2781 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2783 %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
2784 %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
2786 %mul = mul <8 x i16> %cvec1, %cvec2
2787 %mul0 = extractelement <8 x i16> %mul, i64 0
2788 %mul1 = extractelement <8 x i16> %mul, i64 1
2789 %mul2 = extractelement <8 x i16> %mul, i64 2
2790 %mul3 = extractelement <8 x i16> %mul, i64 3
2791 %mul4 = extractelement <8 x i16> %mul, i64 4
2792 %mul5 = extractelement <8 x i16> %mul, i64 5
2793 %mul6 = extractelement <8 x i16> %mul, i64 6
2794 %mul7 = extractelement <8 x i16> %mul, i64 7
2796 %acc = load i16, i16 addrspace(1)* %dst, align 4
2797 %add1 = add i16 %mul0, %acc
2798 %add2 = add i16 %add1, %mul1
2799 %add3 = add i16 %add2, %mul2
2800 %add4 = add i16 %add3, %mul3
2801 %add5 = add i16 %add4, %mul4
2802 %add6 = add i16 %add5, %mul5
2803 %add7 = add i16 %add6, %mul6
2804 %add8 = add i16 %add7, %mul7
2806 store i16 %add8, i16 addrspace(1)* %dst, align 4
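2810 ; TODO: Support this pattern (vector multiply of sign-extended 4-bit lanes summed into an 8-bit accumulator); the expected output below is still expanded multiply/add code rather than a dot instruction.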
2811 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2812 ; GFX7-LABEL: idot8_acc8_vecMul:
2813 ; GFX7: ; %bb.0: ; %entry
2814 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2815 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2816 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2817 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2818 ; GFX7-NEXT: s_mov_b32 s14, -1
2819 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2820 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2821 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2822 ; GFX7-NEXT: s_mov_b32 s10, 0
2823 ; GFX7-NEXT: s_mov_b32 s11, s3
2824 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2825 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2826 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2827 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2828 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2829 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2830 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2831 ; GFX7-NEXT: s_mov_b32 s2, -1
2832 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
2833 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2834 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2835 ; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
2836 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
2837 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2838 ; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
2839 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
2840 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
2841 ; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
2842 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2
2843 ; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
2844 ; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
2845 ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
2846 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
2847 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
2848 ; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
2849 ; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4
2850 ; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0
2851 ; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
2852 ; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
2853 ; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
2854 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
2855 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
2856 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2857 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
2858 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
2859 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9
2860 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
2861 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16
2862 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2863 ; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24
2864 ; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24
2865 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
2866 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
2867 ; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
2868 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
2869 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
2870 ; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
2871 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
2872 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
2873 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
2874 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
2875 ; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
2876 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
2877 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
2878 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
2879 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2880 ; GFX7-NEXT: s_endpgm
2882 ; GFX8-LABEL: idot8_acc8_vecMul:
2883 ; GFX8: ; %bb.0: ; %entry
2884 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2885 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2886 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2887 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
2888 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2889 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2890 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2891 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2892 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2893 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2894 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2895 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2896 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2897 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2898 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2899 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2900 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2901 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2902 ; GFX8-NEXT: s_mov_b32 s10, -1
2903 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2904 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2905 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2906 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2907 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
2908 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
2909 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
2910 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
2911 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
2912 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
2913 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2914 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2
2915 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
2916 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
2917 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
2918 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
2919 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2920 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2921 ; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2
2922 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2923 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2924 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10
2925 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16
2926 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
2927 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
2928 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3
2929 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6
2930 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15
2931 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18
2932 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
2933 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
2934 ; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2
2935 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11
2936 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
2937 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
2938 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
2939 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
2940 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2941 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2942 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2943 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2944 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2945 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2946 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14
2947 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2948 ; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
2949 ; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18
2950 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2951 ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2952 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2953 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2954 ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19
2955 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2956 ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11
2957 ; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2958 ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2959 ; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2960 ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2961 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
2962 ; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2963 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2964 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
2965 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2966 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
2967 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3
2968 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
2969 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
2970 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2971 ; GFX8-NEXT: v_add_u16_e32 v3, v8, v4
2972 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
2973 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
2974 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
2975 ; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2
2976 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v6
2977 ; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
2978 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v10
2979 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2980 ; GFX8-NEXT: s_endpgm
2982 ; GFX9-LABEL: idot8_acc8_vecMul:
2983 ; GFX9: ; %bb.0: ; %entry
2984 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2985 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2986 ; GFX9-NEXT: s_mov_b32 s10, -1
2987 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2988 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2989 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2990 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2991 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2992 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2993 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2994 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2995 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2996 ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
2997 ; GFX9-NEXT: v_mov_b32_e32 v0, 12
2998 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2999 ; GFX9-NEXT: s_waitcnt vmcnt(2)
3000 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1
3001 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3002 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
3003 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3004 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
3005 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3006 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2
3007 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
3008 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
3009 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
3010 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
3011 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
3012 ; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3013 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3014 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2
3015 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3016 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3017 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9
3018 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15
3019 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
3020 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
3021 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
3022 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
3023 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1
3024 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5
3025 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14
3026 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17
3027 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
3028 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
3029 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
3030 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
3031 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0
3032 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10
3033 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
3034 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
3035 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
3036 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
3037 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13
3038 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
3039 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
3040 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0
3041 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
3042 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
3043 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18
3044 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17
3045 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3046 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3047 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3048 ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10
3049 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3050 ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3051 ; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3052 ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3053 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14
3054 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3055 ; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3056 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
3057 ; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3058 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3059 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
3060 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
3061 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2
3062 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3063 ; GFX9-NEXT: v_add_u16_e32 v2, v7, v4
3064 ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
3065 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
3066 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
3067 ; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
3068 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5
3069 ; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
3070 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
3071 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
3072 ; GFX9-NEXT: s_endpgm
3074 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
3075 ; GFX9-DL: ; %bb.0: ; %entry
3076 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3077 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3078 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
3079 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
3080 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
3081 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3082 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3083 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3084 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
3085 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3086 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
3087 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
3088 ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
3089 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12
3090 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
3091 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
3092 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1
3093 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3094 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
3095 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3096 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
3097 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3098 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2
3099 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
3100 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
3101 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
3102 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
3103 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
3104 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3105 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3106 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2
3107 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3108 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3109 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9
3110 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15
3111 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
3112 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
3113 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
3114 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
3115 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1
3116 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5
3117 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14
3118 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17
3119 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
3120 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
3121 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
3122 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
3123 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0
3124 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10
3125 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
3126 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
3127 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
3128 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
3129 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13
3130 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
3131 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
3132 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0
3133 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
3134 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
3135 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18
3136 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17
3137 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3138 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3139 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3140 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10
3141 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3142 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3143 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3144 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3145 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14
3146 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3147 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3148 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
3149 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3150 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3151 ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0
3152 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
3153 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2
3154 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3155 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4
3156 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1
3157 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
3158 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
3159 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
3160 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5
3161 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
3162 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
3163 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
3164 ; GFX9-DL-NEXT: s_endpgm
3166 ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
3167 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
3168 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3169 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3170 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3171 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
3172 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3173 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3174 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
3175 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
3176 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
3177 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
3178 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
3179 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
3180 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
3181 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
3182 ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
3183 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
3184 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
3185 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
3186 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
3187 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3188 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2
3189 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
3190 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
3191 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
3192 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2
3193 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
3194 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
3195 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
3196 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
3197 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1
3198 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3199 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2
3200 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2
3201 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
3202 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
3203 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
3204 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
3205 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15
3206 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3207 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
3208 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2
3209 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2
3210 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
3211 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
3212 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
3213 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0
3214 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
3215 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
3216 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
3217 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17
3218 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
3219 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
3220 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
3221 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
3222 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
3223 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
3224 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
3225 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
3226 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
3227 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
3228 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
3229 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
3230 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15
3231 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3232 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
3233 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
3234 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
3235 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
3236 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
3237 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11
3238 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
3239 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10
3240 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
3241 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12
3242 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
3243 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2
3244 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
3245 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3246 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3247 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3248 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3249 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13
3250 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
3251 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3
3252 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3253 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10
3254 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
3255 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
3256 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8
3257 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2
3258 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0
3259 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
3260 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
3261 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
3262 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
3263 ; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
3264 ; GFX10-DL-XNACK-NEXT: s_endpgm
3266 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
3267 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
3268 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3269 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3270 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3271 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
3272 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3273 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3274 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
3275 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
3276 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
3277 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
3278 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
3279 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
3280 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
3281 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
3282 ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
3283 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
3284 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
3285 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
3286 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
3287 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3288 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0
3289 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
3290 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
3291 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
3292 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0
3293 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
3294 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
3295 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
3296 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
3297 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1
3298 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3299 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0
3300 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0
3301 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
3302 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
3303 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
3304 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
3305 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15
3306 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3307 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
3308 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0
3309 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0
3310 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
3311 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3
3312 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
3313 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
3314 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
3315 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
3316 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
3317 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
3318 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16
3319 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
3320 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
3321 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
3322 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
3323 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
3324 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
3325 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3
3326 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
3327 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
3328 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
3329 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
3330 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
3331 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3332 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
3333 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
3334 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
3335 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
3336 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11
3337 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13
3338 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0
3339 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10
3340 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
3341 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12
3342 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14
3343 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3
3344 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6
3345 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3346 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3347 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3348 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3349 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13
3350 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
3351 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2
3352 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3353 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10
3354 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
3355 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
3356 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8
3357 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2
3358 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0
3359 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
3360 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
3361 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
3362 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
3363 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
3364 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
3365 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
3366 ; GFX10-DL: ; %bb.0: ; %entry
3367 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
3368 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3369 ; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
3370 ; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
3371 ; GFX10-DL-NEXT: s_mov_b32 s22, -1
3372 ; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000
3373 ; GFX10-DL-NEXT: s_add_u32 s20, s20, s3
3374 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3375 ; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0
3376 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3377 ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
3378 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
3379 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
3380 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
3381 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3382 ; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4
3383 ; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4
3384 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9
3385 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
3386 ; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 12
3387 ; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 12
3388 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0
3389 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
3390 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17
3391 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10
3392 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
3393 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
3394 ; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 8
3395 ; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 8
3396 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
3397 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18
3398 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
3399 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
3400 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v5
3401 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v12
3402 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
3403 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
3404 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
3405 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3
3406 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6
3407 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v19, v13
3408 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20
3409 ; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16
3410 ; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 28
3411 ; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 24
3412 ; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20
3413 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3414 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
3415 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
3416 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
3417 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3
3418 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s12
3419 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v11
3420 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
3421 ; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16
3422 ; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 28
3423 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s13
3424 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v7
3425 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v8
3426 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9
3427 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3428 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
3429 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14
3430 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10
3431 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v12
3432 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 24
3433 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
3434 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15
3435 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
3436 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3
3437 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9
3438 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v8, v6
3439 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v10
3440 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v14
3441 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3
3442 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3443 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
3444 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v4
3445 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v5, v11
3446 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v7
3447 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v8
3448 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3449 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3450 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
3451 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
3452 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3453 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v4
3454 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
3455 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3
3456 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
3457 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3458 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3459 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
3460 ; GFX10-DL-NEXT: s_endpgm
3461 <8 x i4> addrspace(1)* %src2,
3462 i8 addrspace(1)* nocapture %dst) {
3464 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3465 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3466 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3467 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3468 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3470 %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
3471 %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
3473 %mul = mul <8 x i8> %cvec1, %cvec2
3474 %mul0 = extractelement <8 x i8> %mul, i64 0
3475 %mul1 = extractelement <8 x i8> %mul, i64 1
3476 %mul2 = extractelement <8 x i8> %mul, i64 2
3477 %mul3 = extractelement <8 x i8> %mul, i64 3
3478 %mul4 = extractelement <8 x i8> %mul, i64 4
3479 %mul5 = extractelement <8 x i8> %mul, i64 5
3480 %mul6 = extractelement <8 x i8> %mul, i64 6
3481 %mul7 = extractelement <8 x i8> %mul, i64 7
3483 %acc = load i8, i8 addrspace(1)* %dst, align 4
3484 %add1 = add i8 %mul0, %acc
3485 %add2 = add i8 %add1, %mul1
3486 %add3 = add i8 %add2, %mul2
3487 %add4 = add i8 %add3, %mul3
3488 %add5 = add i8 %add4, %mul4
3489 %add6 = add i8 %add5, %mul5
3490 %add7 = add i8 %add6, %mul6
3491 %add8 = add i8 %add7, %mul7
3493 store i8 %add8, i8 addrspace(1)* %dst, align 4
; External declaration of the AMDGPU work-item-id (X) intrinsic. It takes no
; arguments and returns an i32; the kernel above calls it to obtain %idx, the
; element index used for both the %src1 and %src2 getelementptr computations.
3497 declare i32 @llvm.amdgcn.workitem.id.x()