1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
; idot8_acc32: signed 8 x i4 dot product with a 32-bit accumulator.
; Each work-item (indexed by workitem.id.x) loads one <8 x i4> from %src1 and
; one from %src2, sign-extends every lane to i32, multiplies lane-wise, and
; adds the eight products to the i32 loaded from %dst before storing the sum
; back to %dst.
; Expected selection per check prefix:
;   GFX7 / GFX8      - v_bfe_i32 / v_ashrrev_i32 lane extraction + v_mad_i32_i24 chain
;   GFX9             - v_mul_i32_i24 products combined with v_add3_u32
;   GFX9-DL, GFX10-DL-XNACK, GFX10-DL-NOXNACK - a single v_dot8_i32_i4
; Only prefixes enabled by a RUN line are asserted here.
11 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
12 ; GFX7-LABEL: idot8_acc32:
13 ; GFX7: ; %bb.0: ; %entry
14 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
15 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
16 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
17 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
18 ; GFX7-NEXT: s_mov_b32 s14, -1
19 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
20 ; GFX7-NEXT: s_add_u32 s12, s12, s3
21 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
22 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
24 ; GFX7-NEXT: s_mov_b32 s10, 0
25 ; GFX7-NEXT: s_mov_b32 s11, s3
26 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
27 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
28 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
29 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
30 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
31 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
32 ; GFX7-NEXT: s_mov_b32 s2, -1
33 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
34 ; GFX7-NEXT: s_waitcnt vmcnt(1)
35 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
36 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
37 ; GFX7-NEXT: s_waitcnt vmcnt(0)
38 ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
39 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
40 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
42 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
43 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
44 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
45 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
46 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
47 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
48 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
49 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
50 ; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
51 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
52 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
53 ; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
54 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
55 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
56 ; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
57 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
58 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
59 ; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
60 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
61 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
64 ; GFX8-LABEL: idot8_acc32:
65 ; GFX8: ; %bb.0: ; %entry
66 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
67 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
68 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
69 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
70 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
71 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
72 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
73 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
74 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
75 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
76 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
77 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
78 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
79 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
80 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
81 ; GFX8-NEXT: s_mov_b32 s10, -1
82 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
83 ; GFX8-NEXT: s_add_u32 s8, s8, s3
84 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
85 ; GFX8-NEXT: s_waitcnt vmcnt(1)
86 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
87 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
88 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
89 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
90 ; GFX8-NEXT: s_waitcnt vmcnt(0)
91 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
92 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
93 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
95 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
96 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
97 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
98 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
99 ; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
100 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
101 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
102 ; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
103 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
104 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
105 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
106 ; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
107 ; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
108 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
109 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
110 ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
111 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
112 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
113 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
114 ; GFX8-NEXT: flat_store_dword v[0:1], v2
115 ; GFX8-NEXT: s_endpgm
117 ; GFX9-LABEL: idot8_acc32:
118 ; GFX9: ; %bb.0: ; %entry
119 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
120 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
121 ; GFX9-NEXT: s_mov_b32 s10, -1
122 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
123 ; GFX9-NEXT: s_add_u32 s8, s8, s3
124 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
126 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
127 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
130 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
131 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
132 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
133 ; GFX9-NEXT: s_waitcnt vmcnt(1)
134 ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
137 ; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
138 ; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
139 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v4
140 ; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
141 ; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
142 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, v5, v6
143 ; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
144 ; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
145 ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
146 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
147 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
148 ; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
149 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
150 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
151 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
152 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
153 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v7, v8
154 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10
155 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
156 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4
158 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12
159 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14
160 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6
161 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16
162 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8
163 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1
164 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
165 ; GFX9-NEXT: s_endpgm
167 ; GFX9-DL-LABEL: idot8_acc32:
168 ; GFX9-DL: ; %bb.0: ; %entry
169 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
170 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
171 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
172 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
173 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
174 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
175 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
176 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
177 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
178 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
179 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
181 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
182 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
183 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
184 ; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0
185 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
186 ; GFX9-DL-NEXT: s_endpgm
188 ; GFX10-DL-XNACK-LABEL: idot8_acc32:
189 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
190 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
191 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
192 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
193 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
194 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
195 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
196 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
197 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
198 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
199 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
201 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
202 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
203 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
204 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
205 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
206 ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
207 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
208 ; GFX10-DL-XNACK-NEXT: s_endpgm
210 ; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
211 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
212 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
213 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
214 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
215 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
216 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
217 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
218 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
219 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
220 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
221 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
222 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
224 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
225 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
226 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
227 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
228 ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
229 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
230 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
252 <8 x i4> addrspace(1)* %src2,
253 i32 addrspace(1)* nocapture %dst) {
255 %idx = call i32 @llvm.amdgcn.workitem.id.x()
256 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
257 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
258 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
259 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
261 %v1e0 = extractelement <8 x i4> %vec1, i64 0
262 %cv1e0 = sext i4 %v1e0 to i32
263 %v2e0 = extractelement <8 x i4> %vec2, i64 0
264 %cv2e0 = sext i4 %v2e0 to i32
265 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
267 %v1e1 = extractelement <8 x i4> %vec1, i64 1
268 %cv1e1 = sext i4 %v1e1 to i32
269 %v2e1 = extractelement <8 x i4> %vec2, i64 1
270 %cv2e1 = sext i4 %v2e1 to i32
271 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
273 %v1e2 = extractelement <8 x i4> %vec1, i64 2
274 %cv1e2 = sext i4 %v1e2 to i32
275 %v2e2 = extractelement <8 x i4> %vec2, i64 2
276 %cv2e2 = sext i4 %v2e2 to i32
277 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
279 %v1e3 = extractelement <8 x i4> %vec1, i64 3
280 %cv1e3 = sext i4 %v1e3 to i32
281 %v2e3 = extractelement <8 x i4> %vec2, i64 3
282 %cv2e3 = sext i4 %v2e3 to i32
283 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
285 %v1e4 = extractelement <8 x i4> %vec1, i64 4
286 %cv1e4 = sext i4 %v1e4 to i32
287 %v2e4 = extractelement <8 x i4> %vec2, i64 4
288 %cv2e4 = sext i4 %v2e4 to i32
289 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
291 %v1e5 = extractelement <8 x i4> %vec1, i64 5
292 %cv1e5 = sext i4 %v1e5 to i32
293 %v2e5 = extractelement <8 x i4> %vec2, i64 5
294 %cv2e5 = sext i4 %v2e5 to i32
295 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
297 %v1e6 = extractelement <8 x i4> %vec1, i64 6
298 %cv1e6 = sext i4 %v1e6 to i32
299 %v2e6 = extractelement <8 x i4> %vec2, i64 6
300 %cv2e6 = sext i4 %v2e6 to i32
301 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
303 %v1e7 = extractelement <8 x i4> %vec1, i64 7
304 %cv1e7 = sext i4 %v1e7 to i32
305 %v2e7 = extractelement <8 x i4> %vec2, i64 7
306 %cv2e7 = sext i4 %v2e7 to i32
307 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
309 %acc = load i32, i32 addrspace(1)* %dst, align 4
310 %add1 = add i32 %mul0, %acc
311 %add2 = add i32 %add1, %mul1
312 %add3 = add i32 %add2, %mul2
313 %add4 = add i32 %add3, %mul3
314 %add5 = add i32 %add4, %mul4
315 %add6 = add i32 %add5, %mul5
316 %add7 = add i32 %add6, %mul6
317 %add8 = add i32 %add7, %mul7
319 store i32 %add8, i32 addrspace(1)* %dst, align 4
323 ; TODO: Once the unnecessary zero extensions of the elements are removed,
324 ; the pattern recognizer will kick in.
325 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
326 ; GFX7-LABEL: idot8_acc16:
327 ; GFX7: ; %bb.0: ; %entry
328 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
329 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
330 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
331 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
332 ; GFX7-NEXT: s_mov_b32 s14, -1
333 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
334 ; GFX7-NEXT: s_add_u32 s12, s12, s3
335 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
336 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
338 ; GFX7-NEXT: s_mov_b32 s10, 0
339 ; GFX7-NEXT: s_mov_b32 s11, s3
340 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
341 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
342 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
343 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
344 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
345 ; GFX7-NEXT: s_mov_b32 s2, -1
346 ; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
347 ; GFX7-NEXT: s_mov_b32 s4, 0xffff
348 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
349 ; GFX7-NEXT: s_waitcnt vmcnt(1)
350 ; GFX7-NEXT: v_bfe_i32 v1, v3, 0, 4
351 ; GFX7-NEXT: v_bfe_i32 v4, v3, 4, 4
352 ; GFX7-NEXT: v_bfe_i32 v5, v3, 8, 4
353 ; GFX7-NEXT: v_bfe_i32 v6, v3, 12, 4
354 ; GFX7-NEXT: v_bfe_i32 v7, v3, 16, 4
355 ; GFX7-NEXT: v_bfe_i32 v8, v3, 20, 4
356 ; GFX7-NEXT: v_bfe_i32 v9, v3, 24, 4
357 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3
358 ; GFX7-NEXT: s_waitcnt vmcnt(0)
359 ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
360 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
361 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
362 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
363 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
364 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
365 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
366 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
367 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v9
368 ; GFX7-NEXT: v_and_b32_e32 v3, v2, v3
369 ; GFX7-NEXT: v_and_b32_e32 v15, v2, v15
370 ; GFX7-NEXT: v_and_b32_e32 v16, v2, v16
371 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0
372 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
373 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
374 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10
375 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
376 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11
377 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
378 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12
379 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
380 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13
381 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
382 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14
383 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
384 ; GFX7-NEXT: s_waitcnt vmcnt(0)
385 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v2
386 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
387 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
388 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
389 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
390 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
391 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
392 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
393 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
394 ; GFX7-NEXT: s_endpgm
396 ; GFX8-LABEL: idot8_acc16:
397 ; GFX8: ; %bb.0: ; %entry
398 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
399 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
400 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
401 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
402 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
403 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
404 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
405 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
406 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
407 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
408 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
409 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
410 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
411 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
412 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
413 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
414 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
415 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
416 ; GFX8-NEXT: s_mov_b32 s10, -1
417 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
418 ; GFX8-NEXT: s_add_u32 s8, s8, s3
419 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
420 ; GFX8-NEXT: s_waitcnt vmcnt(2)
421 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
422 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
423 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
424 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
425 ; GFX8-NEXT: s_waitcnt vmcnt(1)
426 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
427 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2
428 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
429 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
430 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
431 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
432 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
433 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
434 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
435 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
436 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
437 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
438 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
439 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
440 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16
441 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
442 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
443 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
444 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
445 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
446 ; GFX8-NEXT: s_waitcnt vmcnt(0)
447 ; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4
448 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
449 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
450 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
451 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
452 ; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4
453 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
454 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
455 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
456 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
457 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
458 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18
459 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
460 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
461 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
462 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
463 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4
464 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
465 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
466 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
467 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
468 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
469 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
470 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
471 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
472 ; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
473 ; GFX8-NEXT: flat_store_short v[0:1], v2
474 ; GFX8-NEXT: s_endpgm
476 ; GFX9-LABEL: idot8_acc16:
477 ; GFX9: ; %bb.0: ; %entry
478 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
479 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
480 ; GFX9-NEXT: s_mov_b32 s10, -1
481 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
482 ; GFX9-NEXT: s_add_u32 s8, s8, s3
483 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
484 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
485 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
486 ; GFX9-NEXT: v_mov_b32_e32 v4, 12
487 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
488 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
490 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
491 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
492 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
493 ; GFX9-NEXT: s_waitcnt vmcnt(2)
494 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
495 ; GFX9-NEXT: s_waitcnt vmcnt(1)
496 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
497 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
498 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2
499 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
500 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
501 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
502 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
503 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
504 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
505 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
506 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
507 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9
508 ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14
509 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
510 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
511 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
512 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
513 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15
514 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
515 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
516 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
517 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
518 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
519 ; GFX9-NEXT: s_waitcnt vmcnt(0)
520 ; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
521 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
522 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
523 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
524 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
525 ; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
526 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
527 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
528 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
529 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
530 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
531 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
532 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
533 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
534 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
535 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
536 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
537 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5
538 ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10
539 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
540 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
541 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
542 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
543 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
544 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
545 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
546 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
547 ; GFX9-NEXT: s_endpgm
549 ; GFX9-DL-LABEL: idot8_acc16:
550 ; GFX9-DL: ; %bb.0: ; %entry
551 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
552 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
553 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
554 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
555 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
556 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
557 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
558 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
559 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
560 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
561 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
563 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
564 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
565 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
566 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
567 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
568 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
569 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
570 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
571 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
572 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
573 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
574 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
575 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
576 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
577 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
578 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
579 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
580 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
581 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
582 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
583 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
584 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
585 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
586 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
587 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
588 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
589 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
590 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
591 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
592 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
593 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
594 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
595 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
596 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
597 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
598 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
599 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
600 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
601 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
602 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
603 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
604 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
605 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
606 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
607 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
608 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
609 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
610 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
611 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
612 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
613 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
614 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
615 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
616 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
617 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
618 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
619 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3]
620 ; GFX9-DL-NEXT: s_endpgm
622 ; GFX10-DL-XNACK-LABEL: idot8_acc16:
623 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
624 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
625 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
626 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
627 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
628 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
629 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
630 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
631 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
632 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
633 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
634 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
635 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
636 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
637 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
638 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
639 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
640 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
641 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
642 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
643 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
644 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
645 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
646 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
647 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
648 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
649 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
650 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
651 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
652 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
653 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
654 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
655 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
656 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
657 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
658 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
659 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
660 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
661 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
662 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
663 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
664 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
665 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
666 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
667 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
668 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
669 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
670 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
671 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
672 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
673 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
674 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
675 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
676 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
677 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
678 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
679 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
680 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
681 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
682 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
683 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
684 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
685 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
686 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
687 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
688 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
689 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
690 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
691 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
692 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
693 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
694 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
695 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
696 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
697 ; GFX10-DL-XNACK-NEXT: s_endpgm
699 ; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
700 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
701 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
702 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
703 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
704 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
705 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
706 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
707 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
708 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
709 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
710 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
711 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
712 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
713 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
714 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
715 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
716 ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
717 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
718 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
719 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
720 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
721 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
722 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
723 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
724 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
725 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
726 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
727 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
728 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
729 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
730 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
731 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
732 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
733 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
734 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
735 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
736 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
737 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
738 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
739 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
740 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
741 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
742 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
743 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
744 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
745 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
746 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
747 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
748 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
749 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
750 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
751 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
752 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
753 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
754 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
755 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
756 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
757 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
758 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
759 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
760 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
761 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
762 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
763 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
764 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
765 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
766 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
767 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
768 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
769 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
770 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
771 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
772 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
773 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
774 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
775 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
776 ; GFX10-DL-LABEL: idot8_acc16:
777 ; GFX10-DL: ; %bb.0: ; %entry
778 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
779 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
780 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
781 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
782 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
783 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
784 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
785 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
786 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
787 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
789 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
790 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
791 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
792 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
793 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
794 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
795 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
796 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
797 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
798 ; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
799 ; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
800 ; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
801 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
802 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
803 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
804 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
805 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
806 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
807 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
808 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
809 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
810 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
811 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
812 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
813 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
814 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
815 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
816 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
817 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
818 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
819 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
820 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
821 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
822 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
823 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
824 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
825 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
826 ; GFX10-DL-NEXT: s_endpgm
827 <8 x i4> addrspace(1)* %src2,
828 i16 addrspace(1)* nocapture %dst) {
830 %idx = call i32 @llvm.amdgcn.workitem.id.x()
831 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
832 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
833 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
834 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
836 %v1e0 = extractelement <8 x i4> %vec1, i64 0
837 %cv1e0 = sext i4 %v1e0 to i16
838 %v2e0 = extractelement <8 x i4> %vec2, i64 0
839 %cv2e0 = sext i4 %v2e0 to i16
840 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
842 %v1e1 = extractelement <8 x i4> %vec1, i64 1
843 %cv1e1 = sext i4 %v1e1 to i16
844 %v2e1 = extractelement <8 x i4> %vec2, i64 1
845 %cv2e1 = sext i4 %v2e1 to i16
846 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
848 %v1e2 = extractelement <8 x i4> %vec1, i64 2
849 %cv1e2 = sext i4 %v1e2 to i16
850 %v2e2 = extractelement <8 x i4> %vec2, i64 2
851 %cv2e2 = sext i4 %v2e2 to i16
852 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
854 %v1e3 = extractelement <8 x i4> %vec1, i64 3
855 %cv1e3 = sext i4 %v1e3 to i16
856 %v2e3 = extractelement <8 x i4> %vec2, i64 3
857 %cv2e3 = sext i4 %v2e3 to i16
858 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
860 %v1e4 = extractelement <8 x i4> %vec1, i64 4
861 %cv1e4 = sext i4 %v1e4 to i16
862 %v2e4 = extractelement <8 x i4> %vec2, i64 4
863 %cv2e4 = sext i4 %v2e4 to i16
864 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
866 %v1e5 = extractelement <8 x i4> %vec1, i64 5
867 %cv1e5 = sext i4 %v1e5 to i16
868 %v2e5 = extractelement <8 x i4> %vec2, i64 5
869 %cv2e5 = sext i4 %v2e5 to i16
870 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
872 %v1e6 = extractelement <8 x i4> %vec1, i64 6
873 %cv1e6 = sext i4 %v1e6 to i16
874 %v2e6 = extractelement <8 x i4> %vec2, i64 6
875 %cv2e6 = sext i4 %v2e6 to i16
876 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
878 %v1e7 = extractelement <8 x i4> %vec1, i64 7
879 %cv1e7 = sext i4 %v1e7 to i16
880 %v2e7 = extractelement <8 x i4> %vec2, i64 7
881 %cv2e7 = sext i4 %v2e7 to i16
882 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
884 %acc = load i16, i16 addrspace(1)* %dst, align 4
885 %add1 = add i16 %mul0, %acc
886 %add2 = add i16 %add1, %mul1
887 %add3 = add i16 %add2, %mul2
888 %add4 = add i16 %add3, %mul3
889 %add5 = add i16 %add4, %mul4
890 %add6 = add i16 %add5, %mul5
891 %add7 = add i16 %add6, %mul6
892 %add8 = add i16 %add7, %mul7
894 store i16 %add8, i16 addrspace(1)* %dst, align 4
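898 ; TODO: Support this pattern (idot8 with an 8-bit accumulator); it is currently lowered to a chain of v_mad instructions on all checked targets.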
899 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
900 ; GFX7-LABEL: idot8_acc8:
901 ; GFX7: ; %bb.0: ; %entry
902 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
903 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
904 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
905 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
906 ; GFX7-NEXT: s_mov_b32 s14, -1
907 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
908 ; GFX7-NEXT: s_add_u32 s12, s12, s3
909 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
910 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
912 ; GFX7-NEXT: s_mov_b32 s10, 0
913 ; GFX7-NEXT: s_mov_b32 s11, s3
914 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
915 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
916 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
917 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
918 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
919 ; GFX7-NEXT: s_mov_b32 s2, -1
920 ; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
921 ; GFX7-NEXT: s_movk_i32 s4, 0xff
922 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
923 ; GFX7-NEXT: s_waitcnt vmcnt(1)
924 ; GFX7-NEXT: v_bfe_i32 v1, v3, 0, 4
925 ; GFX7-NEXT: v_bfe_i32 v4, v3, 4, 4
926 ; GFX7-NEXT: v_bfe_i32 v5, v3, 8, 4
927 ; GFX7-NEXT: v_bfe_i32 v6, v3, 12, 4
928 ; GFX7-NEXT: v_bfe_i32 v7, v3, 16, 4
929 ; GFX7-NEXT: v_bfe_i32 v8, v3, 20, 4
930 ; GFX7-NEXT: v_bfe_i32 v9, v3, 24, 4
931 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3
932 ; GFX7-NEXT: s_waitcnt vmcnt(0)
933 ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
934 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
935 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
936 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
937 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
938 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
939 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
940 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
941 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v9
942 ; GFX7-NEXT: v_and_b32_e32 v3, v2, v3
943 ; GFX7-NEXT: v_and_b32_e32 v15, v2, v15
944 ; GFX7-NEXT: v_and_b32_e32 v16, v2, v16
945 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0
946 ; GFX7-NEXT: buffer_load_ubyte v2, off, s[0:3], 0
947 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
948 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10
949 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
950 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11
951 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
952 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12
953 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
954 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13
955 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
956 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14
957 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
958 ; GFX7-NEXT: s_waitcnt vmcnt(0)
959 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v2
960 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
961 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
962 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
963 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
964 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
965 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
966 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
967 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
968 ; GFX7-NEXT: s_endpgm
970 ; GFX8-LABEL: idot8_acc8:
971 ; GFX8: ; %bb.0: ; %entry
972 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
973 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
974 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
975 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
976 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
977 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
978 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
979 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
980 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
981 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
982 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
983 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
984 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
985 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
986 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
987 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
988 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
989 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
990 ; GFX8-NEXT: s_mov_b32 s10, -1
991 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
992 ; GFX8-NEXT: s_add_u32 s8, s8, s3
993 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
994 ; GFX8-NEXT: s_waitcnt vmcnt(2)
995 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
996 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
997 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
998 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
999 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1000 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
1001 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2
1002 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
1003 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
1004 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1005 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
1006 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
1007 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
1008 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1009 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
1010 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1011 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1012 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1013 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1014 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16
1015 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1016 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1017 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1018 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1019 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
1020 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1021 ; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4
1022 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1023 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1024 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1025 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1026 ; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4
1027 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1028 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1029 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
1030 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1031 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1032 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1033 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
1034 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
1035 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1036 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1037 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4
1038 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1039 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1040 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
1041 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1042 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
1043 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1044 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1045 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
1046 ; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
1047 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1048 ; GFX8-NEXT: s_endpgm
1050 ; GFX9-LABEL: idot8_acc8:
1051 ; GFX9: ; %bb.0: ; %entry
1052 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1053 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1054 ; GFX9-NEXT: s_mov_b32 s10, -1
1055 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1056 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1057 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1058 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1059 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1060 ; GFX9-NEXT: v_mov_b32_e32 v4, 12
1061 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1062 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1063 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1064 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1065 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1066 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3]
1067 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1068 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
1069 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1070 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
1071 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
1072 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2
1073 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
1074 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1075 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
1076 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
1077 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1078 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
1079 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
1080 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
1081 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1082 ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1083 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1084 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1085 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1086 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1087 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15
1088 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
1089 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1090 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1091 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1092 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1093 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1094 ; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
1095 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1096 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1097 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1098 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1099 ; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
1100 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1101 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1102 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
1103 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1104 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1105 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1106 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1107 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
1108 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1109 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1110 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
1111 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5
1112 ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1113 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
1114 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1115 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
1116 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1117 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1118 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1119 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
1120 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
1121 ; GFX9-NEXT: s_endpgm
1123 ; GFX9-DL-LABEL: idot8_acc8:
1124 ; GFX9-DL: ; %bb.0: ; %entry
1125 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1126 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1127 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1128 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1129 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1130 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1131 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1132 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1133 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
1134 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1135 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1136 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1137 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1138 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1139 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
1140 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
1141 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
1142 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1143 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
1144 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
1145 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
1146 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
1147 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1148 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
1149 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
1150 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
1151 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
1152 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
1153 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
1154 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
1155 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
1156 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1157 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1158 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1159 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1160 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
1161 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
1162 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
1163 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
1164 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
1165 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
1166 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1167 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
1168 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
1169 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
1170 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
1171 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
1172 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
1173 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
1174 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
1175 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
1176 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
1177 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
1178 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
1179 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
1180 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
1181 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
1182 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
1183 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
1184 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
1185 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
1186 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
1187 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
1188 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
1189 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
1190 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
1191 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
1192 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
1193 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
1194 ; GFX9-DL-NEXT: s_endpgm
1196 ; GFX10-DL-XNACK-LABEL: idot8_acc8:
1197 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
1198 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1199 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1200 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1201 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1202 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1203 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
1204 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
1205 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
1206 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
1207 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1208 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
1209 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
1210 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
1211 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
1212 ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1]
1213 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
1214 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1215 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1216 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1217 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1218 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
1219 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
1220 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
1221 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
1222 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
1223 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
1224 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
1225 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
1226 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
1227 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
1228 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
1229 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
1230 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1231 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
1232 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
1233 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
1234 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
1235 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
1236 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
1237 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
1238 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
1239 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
1240 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
1241 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
1242 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
1243 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
1244 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
1245 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
1246 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
1247 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
1248 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
1249 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
1250 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
1251 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
1252 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
1253 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
1254 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
1255 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
1256 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
1257 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
1258 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
1259 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
1260 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
1261 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
1262 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
1263 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
1264 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
1265 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
1266 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
1267 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
1268 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
1269 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
1270 ; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
1271 ; GFX10-DL-XNACK-NEXT: s_endpgm
1273 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
1274 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
1275 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1276 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1277 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1278 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1279 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
1280 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1281 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1282 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
1283 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
1284 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
1285 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
1286 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1287 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1288 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
1289 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
1290 ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
1291 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
1292 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
1293 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
1294 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1
1295 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1296 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
1297 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
1298 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
1299 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
1300 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
1301 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
1302 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
1303 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
1304 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
1305 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
1306 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
1307 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
1308 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
1309 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
1310 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
1311 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
1312 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
1313 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
1314 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
1315 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
1316 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
1317 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
1318 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
1319 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
1320 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
1321 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
1322 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
1323 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
1324 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
1325 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
1326 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
1327 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
1328 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
1329 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
1330 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
1331 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
1332 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
1333 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
1334 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
1335 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
1336 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
1337 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
1338 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
1339 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
1340 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
1341 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
1342 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
1343 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
1344 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
1345 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
1346 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
1347 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
1348 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
1349 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
1350 ; GFX10-DL-LABEL: idot8_acc8:
1351 ; GFX10-DL: ; %bb.0: ; %entry
1352 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1353 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1354 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1355 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1356 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
1357 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
1358 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
1359 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1360 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
1361 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1362 ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
1363 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
1364 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
1365 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1366 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
1367 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
1368 ; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
1369 ; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
1370 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
1371 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
1372 ; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
1373 ; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
1374 ; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
1375 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
1376 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
1377 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
1378 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
1379 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
1380 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
1382 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
1383 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
1384 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
1385 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
1386 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1387 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
1388 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
1389 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1390 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
1391 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
1392 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1393 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
1394 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
1395 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
1396 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
1397 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
1398 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
1399 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
1400 ; GFX10-DL-NEXT: s_endpgm
1401 <8 x i4> addrspace(1)* %src2,
1402 i8 addrspace(1)* nocapture %dst) {
1404 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1405 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1406 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1407 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1408 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1410 %v1e0 = extractelement <8 x i4> %vec1, i64 0
1411 %cv1e0 = sext i4 %v1e0 to i8
1412 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1413 %cv2e0 = sext i4 %v2e0 to i8
1414 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
1416 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1417 %cv1e1 = sext i4 %v1e1 to i8
1418 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1419 %cv2e1 = sext i4 %v2e1 to i8
1420 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
1422 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1423 %cv1e2 = sext i4 %v1e2 to i8
1424 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1425 %cv2e2 = sext i4 %v2e2 to i8
1426 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
1428 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1429 %cv1e3 = sext i4 %v1e3 to i8
1430 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1431 %cv2e3 = sext i4 %v2e3 to i8
1432 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
1434 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1435 %cv1e4 = sext i4 %v1e4 to i8
1436 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1437 %cv2e4 = sext i4 %v2e4 to i8
1438 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
1440 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1441 %cv1e5 = sext i4 %v1e5 to i8
1442 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1443 %cv2e5 = sext i4 %v2e5 to i8
1444 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
1446 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1447 %cv1e6 = sext i4 %v1e6 to i8
1448 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1449 %cv2e6 = sext i4 %v2e6 to i8
1450 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
1452 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1453 %cv1e7 = sext i4 %v1e7 to i8
1454 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1455 %cv2e7 = sext i4 %v2e7 to i8
1456 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
1458 %acc = load i8, i8 addrspace(1)* %dst, align 4
1459 %add1 = add i8 %mul0, %acc
1460 %add2 = add i8 %add1, %mul1
1461 %add3 = add i8 %add2, %mul2
1462 %add4 = add i8 %add3, %mul3
1463 %add5 = add i8 %add4, %mul4
1464 %add6 = add i8 %add5, %mul5
1465 %add7 = add i8 %add6, %mul6
1466 %add8 = add i8 %add7, %mul7
1468 store i8 %add8, i8 addrspace(1)* %dst, align 4
1472 ; Make sure the dot-product pattern is not recognized (no v_dot8_i32_i4 is
1473 ; selected) when an intermediate multiplication has more than one use.
1474 define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1475 ; GFX7-LABEL: idot8_multiuses_mul1:
1476 ; GFX7: ; %bb.0: ; %entry
1477 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1478 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1479 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1480 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1481 ; GFX7-NEXT: s_mov_b32 s14, -1
1482 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1483 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1484 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1485 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1487 ; GFX7-NEXT: s_mov_b32 s10, 0
1488 ; GFX7-NEXT: s_mov_b32 s11, s3
1489 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1490 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1491 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1492 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1493 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1494 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1495 ; GFX7-NEXT: s_mov_b32 s2, -1
1496 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1497 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1498 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
1499 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
1500 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1501 ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
1502 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1503 ; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
1504 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
1505 ; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
1506 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
1507 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
1508 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
1509 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
1510 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
1511 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
1512 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
1513 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
1514 ; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
1515 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
1516 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
1517 ; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
1518 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
1519 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
1520 ; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
1521 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1522 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1523 ; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
1524 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
1525 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0
1526 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1527 ; GFX7-NEXT: s_endpgm
1529 ; GFX8-LABEL: idot8_multiuses_mul1:
1530 ; GFX8: ; %bb.0: ; %entry
1531 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1532 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1533 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1534 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1535 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1536 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1537 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1538 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1539 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1540 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1541 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1542 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1543 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1544 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1545 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1546 ; GFX8-NEXT: s_mov_b32 s10, -1
1547 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1548 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1549 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1550 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1551 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
1552 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
1553 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4
1554 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4
1555 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1556 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
1557 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1558 ; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
1559 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
1560 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
1561 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
1562 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
1563 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
1564 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
1565 ; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4
1566 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4
1567 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1
1568 ; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4
1569 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4
1570 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1
1571 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4
1572 ; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4
1573 ; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1
1574 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3
1575 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1576 ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
1577 ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
1578 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
1579 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1580 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1581 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1582 ; GFX8-NEXT: s_endpgm
1584 ; GFX9-LABEL: idot8_multiuses_mul1:
1585 ; GFX9: ; %bb.0: ; %entry
1586 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1587 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1588 ; GFX9-NEXT: s_mov_b32 s10, -1
1589 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
1590 ; GFX9-NEXT: s_add_u32 s8, s8, s3
1591 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1592 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1593 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1594 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
1595 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1596 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1597 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1598 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1599 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1600 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1601 ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
1602 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1603 ; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4
1604 ; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4
1605 ; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4
1606 ; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4
1607 ; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4
1608 ; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4
1609 ; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4
1610 ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4
1611 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4
1612 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4
1613 ; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4
1614 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4
1615 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4
1616 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1617 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1618 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1619 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1620 ; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0
1621 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6
1622 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8
1623 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2
1624 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v9, v10
1625 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v11, v12
1626 ; GFX9-NEXT: v_add3_u32 v3, v3, v5, v6
1627 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v13, v14
1628 ; GFX9-NEXT: v_mul_i32_i24_e32 v10, v15, v16
1629 ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8
1630 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10
1631 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2
1632 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1633 ; GFX9-NEXT: s_endpgm
1635 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
1636 ; GFX9-DL: ; %bb.0: ; %entry
1637 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1638 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1639 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
1640 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
1641 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
1642 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1643 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1644 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1645 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
1646 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1647 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
1648 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
1649 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
1650 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1651 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
1652 ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4
1653 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1654 ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 4
1655 ; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 4, 4
1656 ; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 4, 4
1657 ; GFX9-DL-NEXT: v_bfe_i32 v7, v1, 8, 4
1658 ; GFX9-DL-NEXT: v_bfe_i32 v8, v2, 8, 4
1659 ; GFX9-DL-NEXT: v_bfe_i32 v9, v1, 12, 4
1660 ; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 12, 4
1661 ; GFX9-DL-NEXT: v_bfe_i32 v11, v1, 16, 4
1662 ; GFX9-DL-NEXT: v_bfe_i32 v12, v2, 16, 4
1663 ; GFX9-DL-NEXT: v_bfe_i32 v13, v1, 20, 4
1664 ; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4
1665 ; GFX9-DL-NEXT: v_bfe_i32 v14, v2, 20, 4
1666 ; GFX9-DL-NEXT: v_bfe_i32 v16, v2, 24, 4
1667 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1668 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1669 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1670 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1671 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0
1672 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6
1673 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8
1674 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2
1675 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v9, v10
1676 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v11, v12
1677 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v5, v6
1678 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v13, v14
1679 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v10, v15, v16
1680 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8
1681 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10
1682 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2
1683 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
1684 ; GFX9-DL-NEXT: s_endpgm
1686 ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
1687 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
1688 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1689 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1690 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1691 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1692 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1693 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
1694 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
1695 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
1696 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
1697 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1698 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
1699 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
1700 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
1701 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
1702 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
1703 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
1704 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
1705 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 4, 4
1707 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
1708 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v2, 8, 4
1709 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v2, 0, 4
1710 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
1711 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
1712 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
1713 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
1714 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
1715 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
1716 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
1717 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
1718 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
1719 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4
1720 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5
1721 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
1722 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4
1723 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
1724 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
1725 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
1726 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
1727 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
1728 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1729 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
1730 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6
1731 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2
1732 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
1733 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
1734 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
1735 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
1736 ; GFX10-DL-XNACK-NEXT: s_endpgm
1738 ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
1739 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
1740 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1741 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1742 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1743 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1744 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1745 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
1746 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
1747 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
1748 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
1749 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1750 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
1751 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
1752 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
1753 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
1754 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
1755 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
1756 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
1757 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
1758 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v0, 4, 4
1759 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 8, 4
1760 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v0, 8, 4
1761 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 0, 4
1762 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 12, 4
1763 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4
1764 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
1765 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
1766 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
1767 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
1768 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
1769 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
1770 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
1771 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4
1772 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5
1773 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
1774 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4
1775 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
1776 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
1777 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4
1778 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
1779 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
1780 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
1781 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
1782 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6
1783 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0
1784 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
1785 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
1786 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
1787 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
1788 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
1835 <8 x i4> addrspace(1)* %src2,
1836 i32 addrspace(1)* nocapture %dst) { ; %dst is the i32 accumulator: loaded as %acc, overwritten with %res
1838 %idx = call i32 @llvm.amdgcn.workitem.id.x() ; per-work-item index selecting one <8 x i4> from %src1 and %src2
1839 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1840 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1841 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1842 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1844 %v1e0 = extractelement <8 x i4> %vec1, i64 0 ; lane 0: sign-extend both i4 elements to i32 and multiply; lanes 1-7 below repeat this shape
1845 %cv1e0 = sext i4 %v1e0 to i32
1846 %v2e0 = extractelement <8 x i4> %vec2, i64 0
1847 %cv2e0 = sext i4 %v2e0 to i32
1848 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 ; %mul0 has two uses below (%add and %add1)
1850 %v1e1 = extractelement <8 x i4> %vec1, i64 1
1851 %cv1e1 = sext i4 %v1e1 to i32
1852 %v2e1 = extractelement <8 x i4> %vec2, i64 1
1853 %cv2e1 = sext i4 %v2e1 to i32
1854 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1856 %v1e2 = extractelement <8 x i4> %vec1, i64 2
1857 %cv1e2 = sext i4 %v1e2 to i32
1858 %v2e2 = extractelement <8 x i4> %vec2, i64 2
1859 %cv2e2 = sext i4 %v2e2 to i32
1860 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1862 %v1e3 = extractelement <8 x i4> %vec1, i64 3
1863 %cv1e3 = sext i4 %v1e3 to i32
1864 %v2e3 = extractelement <8 x i4> %vec2, i64 3
1865 %cv2e3 = sext i4 %v2e3 to i32
1866 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1868 %v1e4 = extractelement <8 x i4> %vec1, i64 4
1869 %cv1e4 = sext i4 %v1e4 to i32
1870 %v2e4 = extractelement <8 x i4> %vec2, i64 4
1871 %cv2e4 = sext i4 %v2e4 to i32
1872 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1874 %v1e5 = extractelement <8 x i4> %vec1, i64 5
1875 %cv1e5 = sext i4 %v1e5 to i32
1876 %v2e5 = extractelement <8 x i4> %vec2, i64 5
1877 %cv2e5 = sext i4 %v2e5 to i32
1878 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1880 %v1e6 = extractelement <8 x i4> %vec1, i64 6
1881 %cv1e6 = sext i4 %v1e6 to i32
1882 %v2e6 = extractelement <8 x i4> %vec2, i64 6
1883 %cv2e6 = sext i4 %v2e6 to i32
1884 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1886 %v1e7 = extractelement <8 x i4> %vec1, i64 7
1887 %cv1e7 = sext i4 %v1e7 to i32
1888 %v2e7 = extractelement <8 x i4> %vec2, i64 7
1889 %cv2e7 = sext i4 %v2e7 to i32
1890 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1892 %acc = load i32, i32 addrspace(1)* %dst, align 4 ; incoming accumulator value
1893 %add = add i32 %mul0, %acc ; first use of %mul0; %add is itself reused by %add1 and %res
1894 %add1 = add i32 %mul0, %add ; second use of %mul0
1895 %add2 = add i32 %add1, %mul1
1896 %add3 = add i32 %add2, %mul2
1897 %add4 = add i32 %add3, %mul3
1898 %add5 = add i32 %add4, %mul4
1899 %add6 = add i32 %add5, %mul5
1900 %add7 = add i32 %add6, %mul6
1901 %add8 = add i32 %add7, %mul7
1903 %res = add i32 %add, %add8 ; second use of %add, outside the %add1..%add8 chain
1904 store i32 %res, i32 addrspace(1)* %dst, align 4
1908 ; TODO: Support this pattern.
1909 define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1910 ; GFX7-LABEL: idot8_acc32_vecMul:
1911 ; GFX7: ; %bb.0: ; %entry
1912 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1913 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1914 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1915 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1916 ; GFX7-NEXT: s_mov_b32 s14, -1
1917 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
1918 ; GFX7-NEXT: s_add_u32 s12, s12, s3
1919 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1920 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1921 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
1922 ; GFX7-NEXT: s_mov_b32 s10, 0
1923 ; GFX7-NEXT: s_mov_b32 s11, s3
1924 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1925 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1926 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1927 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
1928 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1929 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1930 ; GFX7-NEXT: s_mov_b32 s2, -1
1931 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
1932 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1933 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2
1934 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
1935 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
1936 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
1937 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
1938 ; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
1939 ; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4
1940 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
1941 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1942 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0
1943 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
1944 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
1945 ; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
1946 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
1947 ; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
1948 ; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
1949 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
1950 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1951 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
1952 ; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
1953 ; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
1954 ; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
1955 ; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0
1956 ; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0
1957 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0
1958 ; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0
1959 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1960 ; GFX7-NEXT: s_endpgm
1962 ; GFX8-LABEL: idot8_acc32_vecMul:
1963 ; GFX8: ; %bb.0: ; %entry
1964 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1965 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1966 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1967 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1968 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1969 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1970 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1971 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1972 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1973 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1974 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
1975 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1976 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1977 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
1978 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
1979 ; GFX8-NEXT: s_mov_b32 s10, -1
1980 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
1981 ; GFX8-NEXT: s_add_u32 s8, s8, s3
1982 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
1983 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1984 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 28, v0
1985 ; GFX8-NEXT: v_bfe_i32 v3, v0, 24, 4
1986 ; GFX8-NEXT: v_bfe_i32 v4, v0, 20, 4
1987 ; GFX8-NEXT: v_bfe_i32 v5, v0, 16, 4
1988 ; GFX8-NEXT: v_bfe_i32 v6, v0, 12, 4
1989 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
1990 ; GFX8-NEXT: v_bfe_i32 v8, v0, 4, 4
1991 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
1992 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1993 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v1
1994 ; GFX8-NEXT: v_bfe_i32 v10, v1, 24, 4
1995 ; GFX8-NEXT: v_bfe_i32 v11, v1, 20, 4
1996 ; GFX8-NEXT: v_bfe_i32 v12, v1, 16, 4
1997 ; GFX8-NEXT: v_bfe_i32 v13, v1, 12, 4
1998 ; GFX8-NEXT: v_bfe_i32 v14, v1, 8, 4
1999 ; GFX8-NEXT: v_bfe_i32 v15, v1, 4, 4
2000 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 4
2001 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2002 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2
2003 ; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
2004 ; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
2005 ; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
2006 ; GFX8-NEXT: v_mad_i32_i24 v0, v5, v12, v0
2007 ; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
2008 ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v10, v0
2009 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v9, v0
2010 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2011 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2012 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2013 ; GFX8-NEXT: s_endpgm
2015 ; GFX9-LABEL: idot8_acc32_vecMul:
2016 ; GFX9: ; %bb.0: ; %entry
2017 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2018 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2019 ; GFX9-NEXT: s_mov_b32 s10, -1
2020 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2021 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2022 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2023 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2024 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2025 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2026 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2027 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
2028 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2029 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
2030 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2031 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2032 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1
2033 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2
2035 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4
2036 ; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4
2037 ; GFX9-NEXT: v_bfe_i32 v5, v1, 20, 4
2038 ; GFX9-NEXT: v_bfe_i32 v12, v2, 20, 4
2039 ; GFX9-NEXT: v_bfe_i32 v6, v1, 16, 4
2040 ; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4
2041 ; GFX9-NEXT: v_bfe_i32 v7, v1, 12, 4
2042 ; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4
2043 ; GFX9-NEXT: v_bfe_i32 v8, v1, 8, 4
2044 ; GFX9-NEXT: v_bfe_i32 v9, v1, 4, 4
2045 ; GFX9-NEXT: v_bfe_i32 v15, v2, 8, 4
2046 ; GFX9-NEXT: v_bfe_i32 v16, v2, 4, 4
2047 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4
2048 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 4
2049 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2
2050 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, v9, v16
2051 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15
2052 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14
2053 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2054 ; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
2055 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13
2056 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12
2057 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
2058 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, v4, v11
2059 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10
2060 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
2061 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
2062 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
2063 ; GFX9-NEXT: s_endpgm
2065 ; GFX9-DL-LABEL: idot8_acc32_vecMul:
2066 ; GFX9-DL: ; %bb.0: ; %entry
2067 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2068 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2069 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2070 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2071 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2072 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2073 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2074 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2075 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
2076 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2077 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2078 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
2079 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
2080 ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
2081 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2082 ; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0
2083 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
2084 ; GFX9-DL-NEXT: s_endpgm
2086 ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
2087 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
2088 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2089 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2090 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2091 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2092 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2093 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
2094 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
2095 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
2096 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
2097 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
2098 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
2099 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
2100 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
2101 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
2102 ; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
2103 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2104 ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
2105 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
2106 ; GFX10-DL-XNACK-NEXT: s_endpgm
2108 ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
2109 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
2110 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2111 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2112 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2113 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2114 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
2115 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2116 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
2117 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
2118 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
2119 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
2120 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
2121 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
2122 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
2123 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
2124 ; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
2125 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2126 ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
2127 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
2128 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
2129 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
2130 ; GFX10-DL: ; %bb.0: ; %entry
2131 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2132 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2133 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
2134 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
2135 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
2136 ; GFX10-DL-NEXT: s_clause 0x1
2137 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
2138 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2139 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
2140 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
2141 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2142 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
2143 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
2144 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
2145 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2146 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
2147 ; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
2148 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
2149 ; GFX10-DL-NEXT: s_endpgm
2150 <8 x i4> addrspace(1)* %src2,
2151 i32 addrspace(1)* nocapture %dst) {
2153 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2154 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2155 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2156 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2157 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2159 %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
2160 %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
2162 %mul = mul <8 x i32> %cvec1, %cvec2
2163 %mul0 = extractelement <8 x i32> %mul, i64 0
2164 %mul1 = extractelement <8 x i32> %mul, i64 1
2165 %mul2 = extractelement <8 x i32> %mul, i64 2
2166 %mul3 = extractelement <8 x i32> %mul, i64 3
2167 %mul4 = extractelement <8 x i32> %mul, i64 4
2168 %mul5 = extractelement <8 x i32> %mul, i64 5
2169 %mul6 = extractelement <8 x i32> %mul, i64 6
2170 %mul7 = extractelement <8 x i32> %mul, i64 7
2172 %acc = load i32, i32 addrspace(1)* %dst, align 4
2173 %add1 = add i32 %mul0, %acc
2174 %add2 = add i32 %add1, %mul1
2175 %add3 = add i32 %add2, %mul2
2176 %add4 = add i32 %add3, %mul3
2177 %add5 = add i32 %add4, %mul4
2178 %add6 = add i32 %add5, %mul5
2179 %add7 = add i32 %add6, %mul6
2180 %add8 = add i32 %add7, %mul7
2182 store i32 %add8, i32 addrspace(1)* %dst, align 4
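2186 ; TODO: Support this pattern. With a 16-bit accumulator the vector multiply is not yet selected to v_dot8_i32_i4 on the dot-capable targets (compare idot8_acc32_vecMul above); it is expanded into packed 16-bit multiplies and adds instead.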
2187 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2188 ; GFX7-LABEL: idot8_acc16_vecMul:
2189 ; GFX7: ; %bb.0: ; %entry
2190 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2191 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2192 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2193 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2194 ; GFX7-NEXT: s_mov_b32 s14, -1
2195 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2196 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2197 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2198 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2199 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2200 ; GFX7-NEXT: s_mov_b32 s10, 0
2201 ; GFX7-NEXT: s_mov_b32 s11, s3
2202 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2203 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2204 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
2205 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2206 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2207 ; GFX7-NEXT: s_mov_b32 s4, 0xffff
2208 ; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
2209 ; GFX7-NEXT: s_mov_b32 s2, -1
2210 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2211 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2212 ; GFX7-NEXT: v_bfe_i32 v1, v3, 20, 4
2213 ; GFX7-NEXT: v_bfe_i32 v4, v3, 16, 4
2214 ; GFX7-NEXT: v_bfe_i32 v5, v3, 4, 4
2215 ; GFX7-NEXT: v_bfe_i32 v6, v3, 0, 4
2216 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2217 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
2218 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2219 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
2220 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2221 ; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4
2222 ; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4
2223 ; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4
2224 ; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
2225 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1
2226 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
2227 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10
2228 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v11
2229 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12
2230 ; GFX7-NEXT: v_and_b32_e32 v11, v2, v13
2231 ; GFX7-NEXT: v_bfe_i32 v7, v3, 24, 4
2232 ; GFX7-NEXT: v_bfe_i32 v8, v3, 8, 4
2233 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v3
2234 ; GFX7-NEXT: v_bfe_i32 v3, v3, 12, 4
2235 ; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4
2236 ; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
2237 ; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
2238 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4
2239 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
2240 ; GFX7-NEXT: v_or_b32_e32 v6, v11, v10
2241 ; GFX7-NEXT: v_and_b32_e32 v12, v2, v14
2242 ; GFX7-NEXT: v_and_b32_e32 v13, v2, v15
2243 ; GFX7-NEXT: v_and_b32_e32 v14, v2, v16
2244 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v1
2245 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4
2246 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6
2247 ; GFX7-NEXT: v_and_b32_e32 v3, v2, v3
2248 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v9
2249 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0
2250 ; GFX7-NEXT: v_and_b32_e32 v4, v2, v4
2251 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
2252 ; GFX7-NEXT: v_and_b32_e32 v6, v2, v6
2253 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2254 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v5
2255 ; GFX7-NEXT: buffer_load_ushort v5, off, s[0:3], 0
2256 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
2257 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
2258 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2259 ; GFX7-NEXT: v_mad_u32_u24 v4, v4, v6, v5
2260 ; GFX7-NEXT: v_mad_u32_u24 v4, v16, v11, v4
2261 ; GFX7-NEXT: v_mad_u32_u24 v4, v8, v13, v4
2262 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v4
2263 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v2, v0
2264 ; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0
2265 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
2266 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0
2267 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2268 ; GFX7-NEXT: s_endpgm
2270 ; GFX8-LABEL: idot8_acc16_vecMul:
2271 ; GFX8: ; %bb.0: ; %entry
2272 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2273 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2274 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2275 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
2276 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2277 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2278 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2279 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2280 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2281 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2282 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2283 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2284 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2285 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2286 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2287 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2288 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
2289 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2290 ; GFX8-NEXT: s_mov_b32 s10, -1
2291 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2292 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2293 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2294 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2295 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3
2296 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
2297 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
2298 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3
2299 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3
2300 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2301 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2
2302 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2
2303 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
2304 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
2305 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2
2306 ; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2307 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2308 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2309 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2310 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3
2311 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2
2312 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
2313 ; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
2314 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2315 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2316 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
2317 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
2318 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2319 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
2320 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2321 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2322 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
2323 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
2324 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2325 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2326 ; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
2327 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2328 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2329 ; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2
2330 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
2331 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
2332 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
2333 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2334 ; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2
2335 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2336 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
2337 ; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2
2338 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
2339 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
2340 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
2341 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
2342 ; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2
2343 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
2344 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
2345 ; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
2346 ; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2
2347 ; GFX8-NEXT: flat_store_short v[0:1], v2
2348 ; GFX8-NEXT: s_endpgm
2350 ; GFX9-LABEL: idot8_acc16_vecMul:
2351 ; GFX9: ; %bb.0: ; %entry
2352 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2353 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2354 ; GFX9-NEXT: s_mov_b32 s10, -1
2355 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
2356 ; GFX9-NEXT: s_add_u32 s8, s8, s3
2357 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2358 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2359 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2360 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
2361 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2362 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2363 ; GFX9-NEXT: global_load_dword v3, v0, s[4:5]
2364 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
2365 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
2366 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2367 ; GFX9-NEXT: v_and_b32_e32 v10, 15, v3
2368 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2369 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v4
2370 ; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4
2371 ; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4
2372 ; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4
2373 ; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4
2374 ; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4
2375 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2376 ; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4
2377 ; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4
2378 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4
2379 ; GFX9-NEXT: v_and_b32_e32 v10, v2, v10
2380 ; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4
2381 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4
2382 ; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4
2383 ; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4
2384 ; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4
2385 ; GFX9-NEXT: v_and_b32_e32 v17, v2, v17
2386 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10
2387 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17
2388 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1]
2389 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
2390 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2391 ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
2392 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2393 ; GFX9-NEXT: global_load_ushort v4, v1, s[2:3]
2394 ; GFX9-NEXT: v_and_b32_e32 v8, v2, v8
2395 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
2396 ; GFX9-NEXT: v_and_b32_e32 v15, v2, v15
2397 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
2398 ; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8
2399 ; GFX9-NEXT: v_lshl_or_b32 v5, v16, 16, v15
2400 ; GFX9-NEXT: v_and_b32_e32 v6, v2, v6
2401 ; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
2402 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
2403 ; GFX9-NEXT: v_and_b32_e32 v13, v2, v13
2404 ; GFX9-NEXT: v_and_b32_e32 v2, v2, v11
2405 ; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6
2406 ; GFX9-NEXT: v_lshl_or_b32 v7, v14, 16, v13
2407 ; GFX9-NEXT: v_lshl_or_b32 v2, v12, 16, v2
2408 ; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
2409 ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2410 ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
2411 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
2412 ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2413 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2414 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v5
2415 ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
2416 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
2417 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2418 ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2419 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
2420 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v6, v7
2421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2422 ; GFX9-NEXT: v_add_u16_e32 v4, v3, v4
2423 ; GFX9-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2424 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
2425 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2426 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
2427 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2428 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v0
2429 ; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2430 ; GFX9-NEXT: global_store_short v1, v0, s[2:3]
2431 ; GFX9-NEXT: s_endpgm
2433 ; GFX9-DL-LABEL: idot8_acc16_vecMul:
2434 ; GFX9-DL: ; %bb.0: ; %entry
2435 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2436 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2437 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
2438 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
2439 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
2440 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2441 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2442 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2443 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2444 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
2445 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2446 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
2447 ; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7]
2448 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
2449 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2450 ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3
2451 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2452 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4
2453 ; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4
2454 ; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4
2455 ; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4
2456 ; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4
2457 ; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4
2458 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3
2459 ; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
2460 ; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4
2461 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4
2462 ; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10
2463 ; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4
2464 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4
2465 ; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4
2466 ; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4
2467 ; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4
2468 ; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17
2469 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v10
2470 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17
2471 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1]
2472 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
2473 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2474 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
2475 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2476 ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3]
2477 ; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8
2478 ; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0
2479 ; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15
2480 ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0
2481 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8
2482 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v16, 16, v15
2483 ; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6
2484 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
2485 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
2486 ; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13
2487 ; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v11
2488 ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6
2489 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v14, 16, v13
2490 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v12, 16, v2
2491 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
2492 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2493 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
2494 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
2495 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2496 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2497 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v5
2498 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
2499 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
2500 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2501 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2502 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
2503 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v6, v7
2504 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2505 ; GFX9-DL-NEXT: v_add_u16_e32 v4, v3, v4
2506 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2507 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
2508 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2509 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
2510 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2511 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0
2512 ; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2513 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3]
2514 ; GFX9-DL-NEXT: s_endpgm
2516 ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
2517 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
2518 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2519 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2520 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2521 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff
2522 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2523 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2524 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
2525 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
2526 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
2527 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
2528 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
2529 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
2530 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
2531 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
2532 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
2533 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
2534 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
2535 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 15, v1
2536 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
2537 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2
2538 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4
2539 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4
2540 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 24, 4
2541 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
2542 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 20, 4
2543 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 12, 4
2544 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v1, v1, 4, 4
2545 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11
2546 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4
2547 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13
2548 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 8, 4
2549 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9
2550 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
2551 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7
2552 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
2553 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4
2554 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2
2555 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2556 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4
2557 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 20, 4
2558 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 12, 4
2559 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v18
2560 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
2561 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9
2562 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7
2563 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2564 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v13
2565 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
2566 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2567 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v15
2568 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2569 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2570 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
2571 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2572 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10
2573 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2574 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2575 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
2576 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
2577 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
2578 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5
2579 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
2580 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12
2581 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2
2582 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10
2583 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
2584 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2585 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
2586 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2587 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
2588 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1]
2589 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
2590 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
2591 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6
2592 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2593 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2594 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2595 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4
2596 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3
2597 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5
2598 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2599 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
2600 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
2601 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
2602 ; GFX10-DL-XNACK-NEXT: s_endpgm
2604 ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
2605 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
2606 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2607 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2608 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2609 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
2610 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff
2611 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2612 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2613 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
2614 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
2615 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
2616 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
2617 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
2618 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
2619 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
2620 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
2621 ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
2622 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
2623 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 15, v1
2624 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
2625 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0
2626 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4
2627 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4
2628 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 24, 4
2629 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
2630 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 20, 4
2631 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 12, 4
2632 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v1, v1, 4, 4
2633 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11
2634 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4
2635 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13
2636 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 8, 4
2637 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9
2638 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
2639 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7
2640 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
2641 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4
2642 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0
2643 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2644 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4
2645 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 20, 4
2646 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 12, 4
2647 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v18
2648 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
2649 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9
2650 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7
2651 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2652 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v13
2653 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
2654 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2655 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v15
2656 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2657 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
2658 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
2659 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2660 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10
2661 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2662 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
2663 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
2664 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
2665 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3
2666 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5
2667 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
2668 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12
2669 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0
2670 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10
2671 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
2672 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2673 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
2674 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
2675 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
2676 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
2677 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
2678 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
2679 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6
2680 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2681 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2682 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2683 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4
2684 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3
2685 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5
2686 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2687 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
2688 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
2689 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
2690 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
2691 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
2692 ; GFX10-DL: ; %bb.0: ; %entry
2693 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
2694 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
2695 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2696 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2697 ; GFX10-DL-NEXT: s_mov_b32 s14, -1
2698 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
2699 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
2700 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2701 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
2702 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2703 ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
2704 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
2705 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
2706 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2707 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
2708 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28
2709 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
2710 ; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014
2711 ; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008
2712 ; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c
2713 ; GFX10-DL-NEXT: s_and_b32 s10, s0, 15
2714 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004
2715 ; GFX10-DL-NEXT: s_and_b32 s11, s1, 15
2716 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
2717 ; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004
2718 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
2719 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10
2720 ; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c
2721 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
2722 ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008
2723 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2724 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9
2725 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11
2726 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2727 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
2728 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
2729 ; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010
2730 ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014
2731 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
2732 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
2733 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
2734 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7
2735 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0
2736 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
2737 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2738 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2739 ; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
2740 ; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28
2741 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3
2742 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2743 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
2744 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2745 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2746 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
2747 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2748 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
2749 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
2750 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2751 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
2752 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2753 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2754 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
2755 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
2756 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
2757 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2758 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3
2759 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2760 ; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
2761 ; GFX10-DL-NEXT: s_endpgm
2762 <8 x i4> addrspace(1)* %src2,
2763 i16 addrspace(1)* nocapture %dst) {
2765 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2766 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2767 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2768 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2769 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2771 %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
2772 %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
2774 %mul = mul <8 x i16> %cvec1, %cvec2
2775 %mul0 = extractelement <8 x i16> %mul, i64 0
2776 %mul1 = extractelement <8 x i16> %mul, i64 1
2777 %mul2 = extractelement <8 x i16> %mul, i64 2
2778 %mul3 = extractelement <8 x i16> %mul, i64 3
2779 %mul4 = extractelement <8 x i16> %mul, i64 4
2780 %mul5 = extractelement <8 x i16> %mul, i64 5
2781 %mul6 = extractelement <8 x i16> %mul, i64 6
2782 %mul7 = extractelement <8 x i16> %mul, i64 7
2784 %acc = load i16, i16 addrspace(1)* %dst, align 4
2785 %add1 = add i16 %mul0, %acc
2786 %add2 = add i16 %add1, %mul1
2787 %add3 = add i16 %add2, %mul2
2788 %add4 = add i16 %add3, %mul3
2789 %add5 = add i16 %add4, %mul4
2790 %add6 = add i16 %add5, %mul5
2791 %add7 = add i16 %add6, %mul6
2792 %add8 = add i16 %add7, %mul7
2794 store i16 %add8, i16 addrspace(1)* %dst, align 4
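2798 ; TODO: Support this pattern (idot8_acc8_vecMul); the gfx906 and gfx1011/gfx1012 checks below still expand it into shift/multiply/add sequences, with no dot instruction selected.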
2799 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2800 ; GFX7-LABEL: idot8_acc8_vecMul:
2801 ; GFX7: ; %bb.0: ; %entry
2802 ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2803 ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2804 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2805 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2806 ; GFX7-NEXT: s_mov_b32 s14, -1
2807 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
2808 ; GFX7-NEXT: s_add_u32 s12, s12, s3
2809 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2810 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2811 ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
2812 ; GFX7-NEXT: s_mov_b32 s10, 0
2813 ; GFX7-NEXT: s_mov_b32 s11, s3
2814 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2815 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2816 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
2817 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
2818 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2819 ; GFX7-NEXT: s_movk_i32 s4, 0xff
2820 ; GFX7-NEXT: s_mov_b32 s2, -1
2821 ; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
2822 ; GFX7-NEXT: s_mov_b32 s5, 0xffff
2823 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff
2824 ; GFX7-NEXT: s_addc_u32 s13, s13, 0
2825 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2826 ; GFX7-NEXT: v_bfe_i32 v6, v4, 20, 4
2827 ; GFX7-NEXT: v_bfe_i32 v7, v4, 16, 4
2828 ; GFX7-NEXT: v_bfe_i32 v8, v4, 12, 4
2829 ; GFX7-NEXT: v_bfe_i32 v9, v4, 8, 4
2830 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
2831 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
2832 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8
2833 ; GFX7-NEXT: v_and_b32_e32 v9, s4, v9
2834 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2835 ; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4
2836 ; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
2837 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
2838 ; GFX7-NEXT: v_or_b32_e32 v7, v9, v8
2839 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v13
2840 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16
2841 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0
2842 ; GFX7-NEXT: v_bfe_i32 v5, v4, 24, 4
2843 ; GFX7-NEXT: v_bfe_i32 v10, v4, 4, 4
2844 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v4
2845 ; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 4
2846 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v1
2847 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
2848 ; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v0
2849 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10
2850 ; GFX7-NEXT: v_and_b32_e32 v4, v2, v4
2851 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
2852 ; GFX7-NEXT: v_bfe_i32 v15, v0, 16, 4
2853 ; GFX7-NEXT: v_bfe_i32 v17, v0, 8, 4
2854 ; GFX7-NEXT: v_bfe_i32 v18, v0, 4, 4
2855 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
2856 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v11
2857 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v10
2858 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12
2859 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14
2860 ; GFX7-NEXT: v_and_b32_e32 v11, v2, v15
2861 ; GFX7-NEXT: v_and_b32_e32 v14, v2, v17
2862 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v18
2863 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0
2864 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v15
2865 ; GFX7-NEXT: v_or_b32_e32 v8, v9, v8
2866 ; GFX7-NEXT: v_or_b32_e32 v9, v11, v10
2867 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2868 ; GFX7-NEXT: v_and_b32_e32 v6, s5, v6
2869 ; GFX7-NEXT: v_or_b32_e32 v10, v14, v13
2870 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
2871 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8
2872 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
2873 ; GFX7-NEXT: v_and_b32_e32 v0, v3, v0
2874 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2875 ; GFX7-NEXT: v_and_b32_e32 v4, s5, v4
2876 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v8
2877 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v7
2878 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v9
2879 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v6
2880 ; GFX7-NEXT: v_and_b32_e32 v7, v2, v4
2881 ; GFX7-NEXT: v_and_b32_e32 v13, v2, v0
2882 ; GFX7-NEXT: v_bfe_u32 v8, v4, 8, 8
2883 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8
2884 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v4
2885 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0
2886 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
2887 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
2888 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
2889 ; GFX7-NEXT: v_and_b32_e32 v12, v2, v12
2890 ; GFX7-NEXT: v_and_b32_e32 v9, v2, v5
2891 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
2892 ; GFX7-NEXT: v_bfe_u32 v10, v5, 8, 8
2893 ; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8
2894 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
2895 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8
2896 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2897 ; GFX7-NEXT: v_mad_u32_u24 v7, v7, v13, v16
2898 ; GFX7-NEXT: v_mad_u32_u24 v7, v8, v14, v7
2899 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v0, v7
2900 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0
2901 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v2, v0
2902 ; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0
2903 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v3, v0
2904 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v12, v0
2905 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2906 ; GFX7-NEXT: s_endpgm
2908 ; GFX8-LABEL: idot8_acc8_vecMul:
2909 ; GFX8: ; %bb.0: ; %entry
2910 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2911 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2912 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2913 ; GFX8-NEXT: v_mov_b32_e32 v5, 12
2914 ; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2915 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2916 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2917 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
2918 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2919 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2920 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2921 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
2922 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2923 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2924 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2925 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2926 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
2927 ; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2928 ; GFX8-NEXT: s_mov_b32 s10, -1
2929 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000
2930 ; GFX8-NEXT: s_add_u32 s8, s8, s3
2931 ; GFX8-NEXT: s_addc_u32 s9, s9, 0
2932 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2933 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
2934 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
2935 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
2936 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
2937 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2938 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
2939 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
2940 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
2941 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
2942 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2
2943 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
2944 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
2945 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2946 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2947 ; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2
2948 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2949 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2950 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10
2951 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
2952 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
2953 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16
2954 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
2955 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3
2956 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6
2957 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15
2958 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18
2959 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
2960 ; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2
2961 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11
2962 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
2963 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
2964 ; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
2965 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
2966 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
2967 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
2968 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
2969 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
2970 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14
2971 ; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
2972 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2973 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
2974 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
2975 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
2976 ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2977 ; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18
2978 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2979 ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11
2980 ; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2981 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
2982 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
2983 ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19
2984 ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2985 ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2986 ; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2987 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
2988 ; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2989 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2990 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
2991 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2992 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
2993 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3
2994 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
2995 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
2996 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2997 ; GFX8-NEXT: v_add_u16_e32 v3, v8, v4
2998 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
2999 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
3000 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
3001 ; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2
3002 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v6
3003 ; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
3004 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v10
3005 ; GFX8-NEXT: flat_store_byte v[0:1], v2
3006 ; GFX8-NEXT: s_endpgm
3008 ; GFX9-LABEL: idot8_acc8_vecMul:
3009 ; GFX9: ; %bb.0: ; %entry
3010 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3011 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3012 ; GFX9-NEXT: s_mov_b32 s10, -1
3013 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
3014 ; GFX9-NEXT: s_add_u32 s8, s8, s3
3015 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3016 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3017 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3018 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
3019 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
3020 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3021 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
3022 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
3023 ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
3024 ; GFX9-NEXT: v_mov_b32_e32 v0, 12
3025 ; GFX9-NEXT: s_waitcnt vmcnt(2)
3026 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3027 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
3028 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3029 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3030 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
3031 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1
3032 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
3033 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2
3034 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
3035 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
3036 ; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3037 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3038 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3039 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2
3040 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3041 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
3042 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
3043 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9
3044 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
3045 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
3046 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15
3047 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
3048 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1
3049 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5
3050 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14
3051 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17
3052 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
3053 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0
3054 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10
3055 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
3056 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
3057 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
3058 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
3059 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
3060 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
3061 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0
3062 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
3063 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13
3064 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3065 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
3066 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
3067 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
3068 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3069 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17
3070 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3071 ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10
3072 ; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3073 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
3074 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
3075 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18
3076 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3077 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14
3078 ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3079 ; GFX9-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3080 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
3081 ; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3082 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
3083 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3084 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3085 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
3086 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3087 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
3088 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2
3089 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3090 ; GFX9-NEXT: v_add_u16_e32 v1, v7, v4
3091 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v2
3092 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
3093 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
3094 ; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
3095 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5
3096 ; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
3097 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v9
3098 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
3099 ; GFX9-NEXT: s_endpgm
3101 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
3102 ; GFX9-DL: ; %bb.0: ; %entry
3103 ; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3104 ; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3105 ; GFX9-DL-NEXT: s_mov_b32 s10, -1
3106 ; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
3107 ; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
3108 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3109 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3110 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3111 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0
3112 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
3113 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3114 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
3115 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
3116 ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
3117 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12
3118 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
3119 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3120 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
3121 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
3122 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3123 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
3124 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1
3125 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
3126 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2
3127 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
3128 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
3129 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3130 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3131 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3132 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2
3133 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3134 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
3135 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
3136 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9
3137 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
3138 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
3139 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15
3140 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
3141 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1
3142 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5
3143 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14
3144 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17
3145 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
3146 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0
3147 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10
3148 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
3149 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
3150 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
3151 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
3152 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
3153 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
3154 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0
3155 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
3156 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13
3157 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3158 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
3159 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
3160 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
3161 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3162 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17
3163 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3164 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10
3165 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3166 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
3167 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
3168 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18
3169 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3170 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14
3171 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3172 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3173 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v1
3174 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3175 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
3176 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3177 ; GFX9-DL-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3178 ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0
3179 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3180 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
3181 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
3182 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3183 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v7, v4
3184 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2
3185 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
3186 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
3187 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
3188 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5
3189 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
3190 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9
3191 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
3192 ; GFX9-DL-NEXT: s_endpgm
3194 ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
3195 ; GFX10-DL-XNACK: ; %bb.0: ; %entry
3196 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3197 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3198 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3199 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
3200 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3201 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3202 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
3203 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
3204 ; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
3205 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
3206 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
3207 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
3208 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
3209 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
3210 ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
3211 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
3212 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
3213 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
3214 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
3215 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3216 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2
3217 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3218 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
3219 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
3220 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2
3221 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
3222 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2
3223 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
3224 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
3225 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
3226 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
3227 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1
3228 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
3229 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2
3230 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2
3231 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
3232 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
3233 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
3234 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15
3235 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
3236 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
3237 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
3238 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3239 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2
3240 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
3241 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0
3242 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
3243 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
3244 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
3245 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
3246 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
3247 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
3248 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
3249 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17
3250 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
3251 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
3252 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
3253 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
3254 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
3255 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
3256 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
3257 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
3258 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
3259 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15
3260 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3261 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
3262 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
3263 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11
3264 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
3265 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
3266 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
3267 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
3268 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
3269 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10
3270 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
3271 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12
3272 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9
3273 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3274 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3275 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3276 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3277 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6
3278 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
3279 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3
3280 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11
3281 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3282 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10
3283 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
3284 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
3285 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8
3286 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2
3287 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0
3288 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
3289 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
3290 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
3291 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
3292 ; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
3293 ; GFX10-DL-XNACK-NEXT: s_endpgm
3295 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
3296 ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
3297 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3298 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3299 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3300 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
3301 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3302 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3303 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
3304 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
3305 ; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
3306 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
3307 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
3308 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
3309 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
3310 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
3311 ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
3312 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
3313 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
3314 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
3315 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
3316 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
3317 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0
3318 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0
3319 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0
3320 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0
3321 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0
3322 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
3323 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
3324 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0
3325 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
3326 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
3327 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
3328 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
3329 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
3330 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0
3331 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16
3332 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
3333 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1
3334 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
3335 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
3336 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
3337 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
3338 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15
3339 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
3340 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
3341 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
3342 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3343 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
3344 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3
3345 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
3346 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
3347 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
3348 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
3349 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
3350 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0
3351 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
3352 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
3353 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
3354 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
3355 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13
3356 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
3357 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11
3358 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3
3359 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
3360 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
3361 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
3362 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
3363 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3364 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9
3365 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
3366 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12
3367 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14
3368 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6
3369 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10
3370 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18
3371 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
3372 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11
3373 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3
3374 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3375 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3376 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3377 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3378 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6
3379 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
3380 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2
3381 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9
3382 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3383 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9
3384 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
3385 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
3386 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8
3387 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2
3388 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v11, v0
3389 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
3390 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
3391 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
3392 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
3393 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
3394 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
3395 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
3396 ; GFX10-DL: ; %bb.0: ; %entry
3397 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
3398 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3399 ; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
3400 ; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
3401 ; GFX10-DL-NEXT: s_mov_b32 s22, -1
3402 ; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000
3403 ; GFX10-DL-NEXT: s_add_u32 s20, s20, s3
3404 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3405 ; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0
3406 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3407 ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
3408 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
3409 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
3410 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
3411 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3412 ; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4
3413 ; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4
3414 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9
3415 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
3416 ; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 12
3417 ; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 12
3418 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0
3419 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
3420 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17
3421 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10
3422 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
3423 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
3424 ; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 8
3425 ; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 8
3426 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
3427 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18
3428 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
3429 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
3430 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v5
3431 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v12
3432 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
3433 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
3434 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
3435 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3
3436 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6
3437 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v19, v13
3438 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20
3439 ; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16
3440 ; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 28
3441 ; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 24
3442 ; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20
3443 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3444 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
3445 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
3446 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
3447 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3
3448 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s12
3449 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v11
3450 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
3451 ; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16
3452 ; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 28
3453 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s13
3454 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v7
3455 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v8
3456 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9
3457 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3458 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
3459 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14
3460 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10
3461 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v12
3462 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 24
3463 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
3464 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15
3465 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
3466 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3
3467 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9
3468 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v8, v6
3469 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v10
3470 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v14
3471 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3
3472 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3473 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
3474 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v4
3475 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v5, v11
3476 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v7
3477 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v8
3478 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3479 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3480 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
3481 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
3482 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3483 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v4
3484 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
3485 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3
3486 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
3487 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3488 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3489 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
3490 ; GFX10-DL-NEXT: s_endpgm
3491 <8 x i4> addrspace(1)* %src2,
3492 i8 addrspace(1)* nocapture %dst) {
3494 %idx = call i32 @llvm.amdgcn.workitem.id.x()
3495 %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3496 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3497 %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3498 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3500 %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
3501 %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
3503 %mul = mul <8 x i8> %cvec1, %cvec2
3504 %mul0 = extractelement <8 x i8> %mul, i64 0
3505 %mul1 = extractelement <8 x i8> %mul, i64 1
3506 %mul2 = extractelement <8 x i8> %mul, i64 2
3507 %mul3 = extractelement <8 x i8> %mul, i64 3
3508 %mul4 = extractelement <8 x i8> %mul, i64 4
3509 %mul5 = extractelement <8 x i8> %mul, i64 5
3510 %mul6 = extractelement <8 x i8> %mul, i64 6
3511 %mul7 = extractelement <8 x i8> %mul, i64 7
3513 %acc = load i8, i8 addrspace(1)* %dst, align 4
3514 %add1 = add i8 %mul0, %acc
3515 %add2 = add i8 %add1, %mul1
3516 %add3 = add i8 %add2, %mul2
3517 %add4 = add i8 %add3, %mul3
3518 %add5 = add i8 %add4, %mul4
3519 %add6 = add i8 %add5, %mul5
3520 %add7 = add i8 %add6, %mul6
3521 %add8 = add i8 %add7, %mul7
3523 store i8 %add8, i8 addrspace(1)* %dst, align 4
; Declaration of the intrinsic called at the start of the kernel body above.
; It takes no arguments and returns an i32; the kernel uses that value (%idx)
; as the getelementptr index into both the %src1 and %src2 <8 x i4> pointers.
3527 declare i32 @llvm.amdgcn.workitem.id.x()