1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10-DL %s
10 ; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; udot2: unsigned two-element dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends both elements to i32, multiplies them element-wise
; and adds both products to the i32 loaded from %dst, then stores the sum to %dst.
; The checks for gfx906 and gfx1011/gfx1012 expect a single v_dot2_u32_u16; the
; checks for gfx700, gfx803 and gfx900 expect 24-bit multiply / multiply-add
; instructions instead.
12 define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
14 ; GFX7: ; %bb.0: ; %entry
15 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
16 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
17 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
18 ; GFX7-NEXT: s_mov_b32 s10, 0
19 ; GFX7-NEXT: s_mov_b32 s11, s7
20 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
22 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
23 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
24 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
25 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
26 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
27 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
28 ; GFX7-NEXT: s_mov_b32 s6, -1
29 ; GFX7-NEXT: s_waitcnt vmcnt(1)
30 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
31 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
32 ; GFX7-NEXT: s_waitcnt vmcnt(0)
33 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
34 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
35 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
37 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
38 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
42 ; GFX8: ; %bb.0: ; %entry
43 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
44 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
45 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
46 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
47 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
48 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
49 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
50 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
51 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
52 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
53 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
54 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
55 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
56 ; GFX8-NEXT: s_waitcnt vmcnt(1)
57 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
58 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
59 ; GFX8-NEXT: s_waitcnt vmcnt(0)
60 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
61 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
62 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
63 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
64 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
65 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
66 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
67 ; GFX8-NEXT: flat_store_dword v[0:1], v2
70 ; GFX9-NODL-LABEL: udot2:
71 ; GFX9-NODL: ; %bb.0: ; %entry
72 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
73 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
74 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
75 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
76 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
77 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
78 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
79 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
80 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
82 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
84 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
85 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
86 ; GFX9-NODL-NEXT: s_endpgm
88 ; GFX9-DL-LABEL: udot2:
89 ; GFX9-DL: ; %bb.0: ; %entry
90 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
91 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
92 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
93 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
95 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
96 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
97 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
98 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
99 ; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
100 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
101 ; GFX9-DL-NEXT: s_endpgm
103 ; GFX10-DL-LABEL: udot2:
104 ; GFX10-DL: ; %bb.0: ; %entry
105 ; GFX10-DL-NEXT: s_clause 0x1
106 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
107 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
108 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
109 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
110 ; GFX10-DL-NEXT: s_clause 0x1
111 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
112 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
113 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
114 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
115 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
116 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117 ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
118 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
119 ; GFX10-DL-NEXT: s_endpgm
120 ptr addrspace(1) %src2,
121 ptr addrspace(1) nocapture %dst) {
; Per-work-item addressing: one <2 x i16> from each source, indexed by workitem.id.x.
123 %idx = call i32 @llvm.amdgcn.workitem.id.x()
124 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
125 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
126 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
127 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Product of element 0 of both vectors, each zero-extended to i32.
129 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
130 %conv = zext i16 %s1.elt1 to i32
131 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
132 %conv2 = zext i16 %s2.elt1 to i32
133 %mul1 = mul nuw i32 %conv2, %conv
; Product of element 1 of both vectors, each zero-extended to i32.
135 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
136 %conv3 = zext i16 %s1.elt2 to i32
137 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
138 %conv4 = zext i16 %s2.elt2 to i32
139 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (%mul2 + %s3) + %mul1, where %s3 is the current value at %dst,
; and write the result back to %dst.
141 %s3 = load i32, ptr addrspace(1) %dst, align 4
142 %add = add i32 %mul2, %s3
143 %add6 = add i32 %add, %mul1
144 store i32 %add6, ptr addrspace(1) %dst, align 4
148 ; TODO: Support this pattern
150 ; add (mul (S0.y, S1.y), mul (S0.x, S1.x))) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; udot2_MulMul: unsigned two-element products summed first, accumulator added last.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends both elements to i32 and multiplies them
; element-wise. Unlike a chained accumulate, the two products are added to each
; other first and the i32 loaded from %dst is added afterwards.
; The checks for every RUN configuration, including gfx906 and gfx1011/gfx1012,
; expect separate multiply and add instructions; none expects v_dot2_u32_u16.
151 define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
152 ; GFX7-LABEL: udot2_MulMul:
153 ; GFX7: ; %bb.0: ; %entry
154 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
155 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
156 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
157 ; GFX7-NEXT: s_mov_b32 s6, 0
158 ; GFX7-NEXT: s_mov_b32 s7, s3
159 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
160 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
161 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
162 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
163 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
164 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
165 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
166 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
167 ; GFX7-NEXT: s_mov_b32 s2, -1
168 ; GFX7-NEXT: s_waitcnt vmcnt(1)
169 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
170 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
171 ; GFX7-NEXT: s_waitcnt vmcnt(0)
172 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
173 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
174 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2
175 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
176 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
177 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0
178 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
179 ; GFX7-NEXT: s_endpgm
181 ; GFX8-LABEL: udot2_MulMul:
182 ; GFX8: ; %bb.0: ; %entry
183 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
184 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
185 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
186 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
187 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
188 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
189 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
190 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
191 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
192 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
193 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
195 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
196 ; GFX8-NEXT: s_waitcnt vmcnt(1)
197 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
198 ; GFX8-NEXT: s_waitcnt vmcnt(0)
199 ; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
200 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
201 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
202 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
204 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
205 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
206 ; GFX8-NEXT: flat_store_dword v[0:1], v2
207 ; GFX8-NEXT: s_endpgm
209 ; GFX9-NODL-LABEL: udot2_MulMul:
210 ; GFX9-NODL: ; %bb.0: ; %entry
211 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
212 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
213 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
214 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
215 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
216 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
217 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
218 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
219 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
220 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
221 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
222 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0
224 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
225 ; GFX9-NODL-NEXT: s_endpgm
227 ; GFX9-DL-LABEL: udot2_MulMul:
228 ; GFX9-DL: ; %bb.0: ; %entry
229 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
230 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
231 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
232 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
233 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
234 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
235 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
236 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
237 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
238 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
239 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
240 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0
242 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
243 ; GFX9-DL-NEXT: s_endpgm
245 ; GFX10-DL-LABEL: udot2_MulMul:
246 ; GFX10-DL: ; %bb.0: ; %entry
247 ; GFX10-DL-NEXT: s_clause 0x1
248 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
249 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
250 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
251 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
252 ; GFX10-DL-NEXT: s_clause 0x1
253 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
254 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
255 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
256 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
257 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
258 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
259 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
260 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
261 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
262 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
263 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
264 ; GFX10-DL-NEXT: s_endpgm
265 ptr addrspace(1) %src2,
266 ptr addrspace(1) nocapture %dst) {
; Per-work-item addressing: one <2 x i16> from each source, indexed by workitem.id.x.
268 %idx = call i32 @llvm.amdgcn.workitem.id.x()
269 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
270 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
271 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
272 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Product of element 0 of both vectors, each zero-extended to i32.
274 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
275 %conv = zext i16 %s1.elt1 to i32
276 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
277 %conv2 = zext i16 %s2.elt1 to i32
278 %mul1 = mul nuw i32 %conv2, %conv
; Product of element 1 of both vectors, each zero-extended to i32.
280 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
281 %conv3 = zext i16 %s1.elt2 to i32
282 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
283 %conv4 = zext i16 %s2.elt2 to i32
284 %mul2 = mul nuw i32 %conv4, %conv3
; The distinguishing shape of this test: (%mul2 + %mul1) + %s3, i.e. the two
; products feed the same add and the value loaded from %dst is added last.
285 %s3 = load i32, ptr addrspace(1) %dst, align 4
286 %add = add i32 %mul2, %mul1
287 %add6 = add i32 %add, %s3
288 store i32 %add6, ptr addrspace(1) %dst, align 4
; idot2: signed two-element dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), sign-extends both elements to i32, multiplies them element-wise
; and adds both products to the i32 loaded from %dst, then stores the sum to %dst.
; The checks for gfx906 and gfx1011/gfx1012 expect a single v_dot2_i32_i16; the
; checks for gfx700, gfx803 and gfx900 expect signed 24-bit multiply /
; multiply-add instructions instead.
292 define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
294 ; GFX7: ; %bb.0: ; %entry
295 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
296 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
297 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
298 ; GFX7-NEXT: s_mov_b32 s10, 0
299 ; GFX7-NEXT: s_mov_b32 s11, s7
300 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
301 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
302 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
303 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
304 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
305 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
306 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
307 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
308 ; GFX7-NEXT: s_mov_b32 s6, -1
309 ; GFX7-NEXT: s_waitcnt vmcnt(1)
310 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
311 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
312 ; GFX7-NEXT: s_waitcnt vmcnt(0)
313 ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
314 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
315 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
316 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
317 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
318 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
319 ; GFX7-NEXT: s_endpgm
322 ; GFX8: ; %bb.0: ; %entry
323 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
324 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
325 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
326 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
327 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
328 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
329 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
330 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
331 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
332 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
333 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
334 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
335 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
336 ; GFX8-NEXT: s_waitcnt vmcnt(1)
337 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
338 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
339 ; GFX8-NEXT: s_waitcnt vmcnt(0)
340 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
341 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
342 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
343 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
344 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
345 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
346 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
347 ; GFX8-NEXT: flat_store_dword v[0:1], v2
348 ; GFX8-NEXT: s_endpgm
350 ; GFX9-NODL-LABEL: idot2:
351 ; GFX9-NODL: ; %bb.0: ; %entry
352 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
353 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
354 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
357 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
358 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
359 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
360 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
361 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
362 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
363 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
365 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
366 ; GFX9-NODL-NEXT: s_endpgm
368 ; GFX9-DL-LABEL: idot2:
369 ; GFX9-DL: ; %bb.0: ; %entry
370 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
371 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
372 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
375 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
376 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
377 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
378 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
379 ; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
380 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
381 ; GFX9-DL-NEXT: s_endpgm
383 ; GFX10-DL-LABEL: idot2:
384 ; GFX10-DL: ; %bb.0: ; %entry
385 ; GFX10-DL-NEXT: s_clause 0x1
386 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
387 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
388 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
389 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
390 ; GFX10-DL-NEXT: s_clause 0x1
391 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
392 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
393 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
394 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
395 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
396 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397 ; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
398 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
399 ; GFX10-DL-NEXT: s_endpgm
400 ptr addrspace(1) %src2,
401 ptr addrspace(1) nocapture %dst) {
; Per-work-item addressing: one <2 x i16> from each source, indexed by workitem.id.x.
403 %idx = call i32 @llvm.amdgcn.workitem.id.x()
404 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
405 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
406 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
407 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Product of element 0 of both vectors, each sign-extended to i32.
409 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
410 %conv = sext i16 %s1.elt1 to i32
411 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
412 %conv2 = sext i16 %s2.elt1 to i32
413 %mul1 = mul nuw i32 %conv2, %conv
; Product of element 1 of both vectors, each sign-extended to i32.
415 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
416 %conv3 = sext i16 %s1.elt2 to i32
417 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
418 %conv4 = sext i16 %s2.elt2 to i32
419 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (%mul2 + %s3) + %mul1, where %s3 is the current value at %dst,
; and write the result back to %dst.
421 %s3 = load i32, ptr addrspace(1) %dst, align 4
422 %add = add i32 %mul2, %s3
423 %add6 = add i32 %add, %mul1
424 store i32 %add6, ptr addrspace(1) %dst, align 4
; idot2_MixedTypedMul: two-element multiply-accumulate whose two products use
; different extensions. Each work-item loads one <2 x i16> from %src1 and one
; from %src2 (indexed by workitem.id.x); the element-0 operands are sign-extended
; while the element-1 operands are zero-extended before multiplying. Both
; products are added to the i32 loaded from %dst and the sum is stored to %dst.
; The checks for every RUN configuration, including gfx906 and gfx1011/gfx1012,
; expect one signed and one unsigned 24-bit multiply (or multiply-add); none
; expects a v_dot2 instruction.
428 define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
429 ; GFX7-LABEL: idot2_MixedTypedMul:
430 ; GFX7: ; %bb.0: ; %entry
431 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
432 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
433 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
434 ; GFX7-NEXT: s_mov_b32 s10, 0
435 ; GFX7-NEXT: s_mov_b32 s11, s7
436 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
437 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
438 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
439 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
440 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
441 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
442 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
443 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
444 ; GFX7-NEXT: s_mov_b32 s6, -1
445 ; GFX7-NEXT: s_waitcnt vmcnt(1)
446 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
447 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
448 ; GFX7-NEXT: s_waitcnt vmcnt(0)
449 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
450 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
451 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
453 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1
454 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
455 ; GFX7-NEXT: s_endpgm
457 ; GFX8-LABEL: idot2_MixedTypedMul:
458 ; GFX8: ; %bb.0: ; %entry
459 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
460 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
461 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
462 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
463 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
464 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
465 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
466 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
467 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
468 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
469 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
470 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
471 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
472 ; GFX8-NEXT: s_waitcnt vmcnt(1)
473 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
474 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
475 ; GFX8-NEXT: s_waitcnt vmcnt(0)
476 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
477 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
478 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
480 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
481 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
482 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
483 ; GFX8-NEXT: flat_store_dword v[0:1], v2
484 ; GFX8-NEXT: s_endpgm
486 ; GFX9-NODL-LABEL: idot2_MixedTypedMul:
487 ; GFX9-NODL: ; %bb.0: ; %entry
488 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
489 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
490 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
491 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
493 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
494 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
495 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
496 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
497 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
498 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
499 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
500 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
501 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
502 ; GFX9-NODL-NEXT: s_endpgm
504 ; GFX9-DL-LABEL: idot2_MixedTypedMul:
505 ; GFX9-DL: ; %bb.0: ; %entry
506 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
507 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
508 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
509 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
511 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
512 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
513 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
514 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
515 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
516 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
517 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
519 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
520 ; GFX9-DL-NEXT: s_endpgm
522 ; GFX10-DL-LABEL: idot2_MixedTypedMul:
523 ; GFX10-DL: ; %bb.0: ; %entry
524 ; GFX10-DL-NEXT: s_clause 0x1
525 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
526 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
527 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
528 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX10-DL-NEXT: s_clause 0x1
530 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
531 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
532 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
533 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
534 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
535 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
536 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
537 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
538 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
539 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
540 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
541 ; GFX10-DL-NEXT: s_endpgm
542 ptr addrspace(1) %src2,
543 ptr addrspace(1) nocapture %dst) {
; Per-work-item addressing: one <2 x i16> from each source, indexed by workitem.id.x.
545 %idx = call i32 @llvm.amdgcn.workitem.id.x()
546 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
547 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
548 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
549 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Product of element 0 of both vectors: both operands are sign-extended.
551 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
552 %conv = sext i16 %s1.elt1 to i32
553 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
554 %conv2 = sext i16 %s2.elt1 to i32
555 %mul1 = mul nuw i32 %conv2, %conv
; Product of element 1 of both vectors: both operands are zero-extended, so
; this multiply does not share the signedness of the element-0 multiply.
557 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
558 %conv3 = zext i16 %s1.elt2 to i32
559 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
560 %conv4 = zext i16 %s2.elt2 to i32
561 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (%mul2 + %s3) + %mul1, where %s3 is the current value at %dst,
; and write the result back to %dst.
563 %s3 = load i32, ptr addrspace(1) %dst, align 4
564 %add = add i32 %mul2, %s3
565 %add6 = add i32 %add, %mul1
566 store i32 %add6, ptr addrspace(1) %dst, align 4
; udot2_alt_AddOperands: unsigned two-element dot product with accumulate, with
; the operands of both adds written accumulator-first / product-first
; (%s3 + %mul2, then %mul1 + %add).
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends both elements to i32, multiplies them element-wise
; and adds both products to the i32 loaded from %dst, then stores the sum to %dst.
; The checks for gfx906 and gfx1011/gfx1012 expect a single v_dot2_u32_u16; the
; checks for gfx700, gfx803 and gfx900 expect two v_mad_u32_u24 instead.
570 define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
571 ; GFX7-LABEL: udot2_alt_AddOperands:
572 ; GFX7: ; %bb.0: ; %entry
573 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
574 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
575 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
576 ; GFX7-NEXT: s_mov_b32 s10, 0
577 ; GFX7-NEXT: s_mov_b32 s11, s7
578 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
580 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
581 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
582 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
583 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
584 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
585 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
586 ; GFX7-NEXT: s_mov_b32 s6, -1
587 ; GFX7-NEXT: s_waitcnt vmcnt(1)
588 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
589 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
590 ; GFX7-NEXT: s_waitcnt vmcnt(0)
591 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
592 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
593 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
594 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
595 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
596 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
597 ; GFX7-NEXT: s_endpgm
599 ; GFX8-LABEL: udot2_alt_AddOperands:
600 ; GFX8: ; %bb.0: ; %entry
601 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
602 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
603 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
604 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
605 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
606 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
607 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
608 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
609 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
610 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
611 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
612 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
613 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
614 ; GFX8-NEXT: s_waitcnt vmcnt(1)
615 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
616 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
617 ; GFX8-NEXT: s_waitcnt vmcnt(0)
618 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
619 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
620 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
622 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
623 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
624 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
625 ; GFX8-NEXT: flat_store_dword v[0:1], v2
626 ; GFX8-NEXT: s_endpgm
628 ; GFX9-NODL-LABEL: udot2_alt_AddOperands:
629 ; GFX9-NODL: ; %bb.0: ; %entry
630 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
631 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
632 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
633 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
634 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
635 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
636 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
637 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
638 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
639 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
640 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
642 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
643 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
644 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
645 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
646 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
647 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
648 ; GFX9-NODL-NEXT: s_endpgm
650 ; GFX9-DL-LABEL: udot2_alt_AddOperands:
651 ; GFX9-DL: ; %bb.0: ; %entry
652 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
653 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
654 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
657 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
658 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
659 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
660 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
661 ; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
662 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
663 ; GFX9-DL-NEXT: s_endpgm
665 ; GFX10-DL-LABEL: udot2_alt_AddOperands:
666 ; GFX10-DL: ; %bb.0: ; %entry
667 ; GFX10-DL-NEXT: s_clause 0x1
668 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
669 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
670 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
671 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
672 ; GFX10-DL-NEXT: s_clause 0x1
673 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
674 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
675 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
676 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
677 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
678 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
679 ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
680 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
681 ; GFX10-DL-NEXT: s_endpgm
682 ptr addrspace(1) %src2,
683 ptr addrspace(1) nocapture %dst) {
; Per-work-item addressing: one <2 x i16> from each source, indexed by workitem.id.x.
685 %idx = call i32 @llvm.amdgcn.workitem.id.x()
686 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
687 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
688 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
689 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Product of element 0 of both vectors, each zero-extended to i32.
691 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
692 %conv = zext i16 %s1.elt1 to i32
693 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
694 %conv2 = zext i16 %s2.elt1 to i32
695 %mul1 = mul nuw i32 %conv2, %conv
; Product of element 1 of both vectors, each zero-extended to i32.
697 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
698 %conv3 = zext i16 %s1.elt2 to i32
699 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
700 %conv4 = zext i16 %s2.elt2 to i32
701 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as %mul1 + (%s3 + %mul2): in each add the multiply result is on
; the opposite operand side from the usual (%mul + acc) chain.
703 %s3 = load i32, ptr addrspace(1) %dst, align 4
704 %add = add i32 %s3, %mul2
705 %add6 = add i32 %mul1, %add
706 store i32 %add6, ptr addrspace(1) %dst, align 4
; Mixed-extension case: the element-0 product multiplies a sign-extended lane
; of %vec1 by a zero-extended lane of %vec2, while the element-1 product
; sign-extends both lanes. The autogenerated checks below expect separate
; 24-bit multiplies/mads plus an add on every tested subtarget; none of them
; contains a v_dot2 instruction.
710 define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
711 ; GFX7-LABEL: idot2_MixedExt:
712 ; GFX7: ; %bb.0: ; %entry
713 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
714 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
715 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
716 ; GFX7-NEXT: s_mov_b32 s10, 0
717 ; GFX7-NEXT: s_mov_b32 s11, s7
718 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
719 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
720 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
721 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
722 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
723 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
724 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
725 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
726 ; GFX7-NEXT: s_mov_b32 s6, -1
727 ; GFX7-NEXT: s_waitcnt vmcnt(1)
728 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
729 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
730 ; GFX7-NEXT: s_waitcnt vmcnt(0)
731 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
732 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
733 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
735 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
736 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
737 ; GFX7-NEXT: s_endpgm
739 ; GFX8-LABEL: idot2_MixedExt:
740 ; GFX8: ; %bb.0: ; %entry
741 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
742 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
743 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
744 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
745 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
746 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
747 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
748 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
749 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
750 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
751 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
752 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
753 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
754 ; GFX8-NEXT: s_waitcnt vmcnt(1)
755 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
756 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
757 ; GFX8-NEXT: s_waitcnt vmcnt(0)
758 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
759 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
760 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
761 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
762 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
763 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
764 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
765 ; GFX8-NEXT: flat_store_dword v[0:1], v2
766 ; GFX8-NEXT: s_endpgm
768 ; GFX9-NODL-LABEL: idot2_MixedExt:
769 ; GFX9-NODL: ; %bb.0: ; %entry
770 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
771 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
772 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
773 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
774 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
775 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
776 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
777 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
778 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
779 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
780 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
781 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
782 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
783 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
784 ; GFX9-NODL-NEXT: s_endpgm
786 ; GFX9-DL-LABEL: idot2_MixedExt:
787 ; GFX9-DL: ; %bb.0: ; %entry
788 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
789 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
790 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
791 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
792 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
793 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
794 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
795 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
796 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
797 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
798 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
799 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
800 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
801 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
802 ; GFX9-DL-NEXT: s_endpgm
804 ; GFX10-DL-LABEL: idot2_MixedExt:
805 ; GFX10-DL: ; %bb.0: ; %entry
806 ; GFX10-DL-NEXT: s_clause 0x1
807 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
808 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
809 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
810 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
811 ; GFX10-DL-NEXT: s_clause 0x1
812 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
813 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
814 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
815 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
816 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
817 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
818 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
819 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
820 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
821 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
822 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
823 ; GFX10-DL-NEXT: s_endpgm
824 ptr addrspace(1) %src2,
825 ptr addrspace(1) nocapture %dst) {
; Each work-item loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
827 %idx = call i32 @llvm.amdgcn.workitem.id.x()
828 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
829 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
830 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
831 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Element 0: sext(%vec1[0]) * zext(%vec2[0]) -- the two extensions differ.
833 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
834 %conv = sext i16 %s1.elt1 to i32
835 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
836 %conv2 = zext i16 %s2.elt1 to i32
837 %mul1 = mul nuw i32 %conv2, %conv
; Element 1: sext(%vec1[1]) * sext(%vec2[1]).
839 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
840 %conv3 = sext i16 %s1.elt2 to i32
841 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
842 %conv4 = sext i16 %s2.elt2 to i32
843 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate both products into the i32 read from %dst and write it back.
845 %s3 = load i32, ptr addrspace(1) %dst, align 4
846 %add = add i32 %mul2, %s3
847 %add6 = add i32 %add, %mul1
848 store i32 %add6, ptr addrspace(1) %dst, align 4
; Negative test: each product squares a lane of a single vector
; (%vec1[0] * %vec1[0] and %vec2[1] * %vec2[1]) instead of pairing lanes of
; %vec1 with lanes of %vec2. The autogenerated checks expect two 24-bit
; multiplies/mads and no v_dot2 instruction on any tested subtarget.
852 define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
853 ; GFX7-LABEL: notudot2_SameVec:
854 ; GFX7: ; %bb.0: ; %entry
855 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
856 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
857 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
858 ; GFX7-NEXT: s_mov_b32 s10, 0
859 ; GFX7-NEXT: s_mov_b32 s11, s7
860 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
861 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
862 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
863 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
864 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
865 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
866 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
867 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
868 ; GFX7-NEXT: s_mov_b32 s6, -1
869 ; GFX7-NEXT: s_waitcnt vmcnt(1)
870 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
871 ; GFX7-NEXT: s_waitcnt vmcnt(0)
872 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
873 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
874 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s0
875 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0
876 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
877 ; GFX7-NEXT: s_endpgm
879 ; GFX8-LABEL: notudot2_SameVec:
880 ; GFX8: ; %bb.0: ; %entry
881 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
882 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
883 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
884 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
886 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
887 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
888 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
889 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
890 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
891 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
892 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
893 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
894 ; GFX8-NEXT: s_waitcnt vmcnt(1)
895 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
896 ; GFX8-NEXT: s_waitcnt vmcnt(0)
897 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
898 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
899 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
900 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
901 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
902 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
903 ; GFX8-NEXT: flat_store_dword v[0:1], v2
904 ; GFX8-NEXT: s_endpgm
906 ; GFX9-NODL-LABEL: notudot2_SameVec:
907 ; GFX9-NODL: ; %bb.0: ; %entry
908 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
909 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
910 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
911 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
912 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
913 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
914 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
915 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
916 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
917 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
918 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
919 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
920 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
921 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1
922 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
923 ; GFX9-NODL-NEXT: s_endpgm
925 ; GFX9-DL-LABEL: notudot2_SameVec:
926 ; GFX9-DL: ; %bb.0: ; %entry
927 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
928 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
929 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
930 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
931 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
932 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
933 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
934 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
935 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
936 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
937 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
938 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
939 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
940 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1
941 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
942 ; GFX9-DL-NEXT: s_endpgm
944 ; GFX10-DL-LABEL: notudot2_SameVec:
945 ; GFX10-DL: ; %bb.0: ; %entry
946 ; GFX10-DL-NEXT: s_clause 0x1
947 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
948 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
949 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
950 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
951 ; GFX10-DL-NEXT: s_clause 0x1
952 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
953 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
954 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
955 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
956 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
957 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
958 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
959 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
960 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
961 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
962 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
963 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
964 ; GFX10-DL-NEXT: s_endpgm
965 ptr addrspace(1) %src2,
966 ptr addrspace(1) nocapture %dst) {
; Each work-item loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
968 %idx = call i32 @llvm.amdgcn.workitem.id.x()
969 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
970 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
971 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
972 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; First product: both factors are zext(%vec1[0]); %vec2 is not involved.
974 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
975 %conv = zext i16 %s1.elt1 to i32
976 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
977 %conv2 = zext i16 %s2.elt1 to i32
978 %mul1 = mul i32 %conv2, %conv
; Second product: both factors are zext(%vec2[1]); %vec1 is not involved.
980 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
981 %conv3 = zext i16 %s1.elt2 to i32
982 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
983 %conv4 = zext i16 %s2.elt2 to i32
984 %mul2 = mul i32 %conv4, %conv3
; Accumulate both products into the i32 read from %dst and write it back.
986 %s3 = load i32, ptr addrspace(1) %dst, align 4
987 %add = add i32 %mul2, %s3
988 %add6 = add i32 %add, %mul1
989 store i32 %add6, ptr addrspace(1) %dst, align 4
; Unsigned two-element dot product over lanes 0 and 1 of <4 x i16> inputs.
; The checks for gfx906 and gfx1011/gfx1012 expect a single v_dot2_u32_u16;
; the remaining subtargets expect 24-bit multiplies/mads. Every check block
; loads only one dword per input vector.
993 define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
994 ; GFX7-LABEL: udot2_v4i16:
995 ; GFX7: ; %bb.0: ; %entry
996 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
997 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
998 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
999 ; GFX7-NEXT: s_mov_b32 s10, 0
1000 ; GFX7-NEXT: s_mov_b32 s11, s7
1001 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1002 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
1003 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1004 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1005 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1006 ; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
1007 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1008 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1009 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1010 ; GFX7-NEXT: s_mov_b32 s6, -1
1011 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1012 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
1013 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1014 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
1015 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1016 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1017 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0
1019 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
1020 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1021 ; GFX7-NEXT: s_endpgm
1023 ; GFX8-LABEL: udot2_v4i16:
1024 ; GFX8: ; %bb.0: ; %entry
1025 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1026 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1027 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1028 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1029 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1030 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1031 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1032 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1033 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
1034 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1035 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1036 ; GFX8-NEXT: flat_load_dword v1, v[2:3]
1037 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1038 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1039 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
1040 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1041 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
1042 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1043 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1044 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1045 ; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
1046 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
1047 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1048 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1049 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1050 ; GFX8-NEXT: s_endpgm
1052 ; GFX9-NODL-LABEL: udot2_v4i16:
1053 ; GFX9-NODL: ; %bb.0: ; %entry
1054 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1055 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1056 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1057 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1058 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1059 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1060 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1061 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1062 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1063 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1064 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1065 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1066 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1067 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
1068 ; GFX9-NODL-NEXT: s_endpgm
1070 ; GFX9-DL-LABEL: udot2_v4i16:
1071 ; GFX9-DL: ; %bb.0: ; %entry
1072 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1073 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1074 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1075 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1076 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1077 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1078 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1079 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1080 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1081 ; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1082 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
1083 ; GFX9-DL-NEXT: s_endpgm
1085 ; GFX10-DL-LABEL: udot2_v4i16:
1086 ; GFX10-DL: ; %bb.0: ; %entry
1087 ; GFX10-DL-NEXT: s_clause 0x1
1088 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1089 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1090 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1091 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1092 ; GFX10-DL-NEXT: s_clause 0x1
1093 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1094 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1095 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1096 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1097 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1098 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1099 ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1100 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
1101 ; GFX10-DL-NEXT: s_endpgm
1102 ptr addrspace(1) %src2,
1103 ptr addrspace(1) nocapture %dst) {
; Each work-item loads one <4 x i16> from %src1 and one from %src2, indexed by
; its workitem id; only lanes 0 and 1 are used afterwards.
1105 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1106 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1107 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1108 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1109 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
; Lane 0 product: zext(%vec1[0]) * zext(%vec2[0]).
1111 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1112 %conv = zext i16 %s1.elt1 to i32
1113 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1114 %conv2 = zext i16 %s2.elt1 to i32
1115 %mul1 = mul i32 %conv2, %conv
; Lane 1 product: zext(%vec1[1]) * zext(%vec2[1]).
1117 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1118 %conv3 = zext i16 %s1.elt2 to i32
1119 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1120 %conv4 = zext i16 %s2.elt2 to i32
1121 %mul2 = mul i32 %conv4, %conv3
; Accumulate both products into the i32 read from %dst and write it back.
1123 %s3 = load i32, ptr addrspace(1) %dst, align 4
1124 %add = add i32 %mul2, %s3
1125 %add6 = add i32 %add, %mul1
1126 store i32 %add6, ptr addrspace(1) %dst, align 4
; Unsigned two-element dot product over lanes 2 and 3 of <4 x i16> inputs.
; The checks expect each input to be fetched as a single dword at byte offset 4
; (gfx803 adds 4 to the address explicitly, the others use offset:4), and
; gfx906 and gfx1011/gfx1012 to emit a single v_dot2_u32_u16. The remaining
; subtargets expect 24-bit multiplies/mads.
1130 define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
1131 ; GFX7-LABEL: udot2_v4i16_Hi:
1132 ; GFX7: ; %bb.0: ; %entry
1133 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1134 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1135 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1136 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1137 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1138 ; GFX7-NEXT: s_mov_b32 s10, 0
1139 ; GFX7-NEXT: s_mov_b32 s11, s7
1140 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1141 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
1142 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
1143 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
1144 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
1145 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1146 ; GFX7-NEXT: s_mov_b32 s6, -1
1147 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1148 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
1149 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1150 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1151 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
1152 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1153 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1154 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0
1155 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
1156 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1157 ; GFX7-NEXT: s_endpgm
1159 ; GFX8-LABEL: udot2_v4i16_Hi:
1160 ; GFX8: ; %bb.0: ; %entry
1161 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1162 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1163 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1164 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1165 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1166 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
1167 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1168 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1169 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0
1170 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1171 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
1172 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1173 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
1174 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
1175 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1176 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1177 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1178 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1179 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
1180 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1181 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1182 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
1183 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1184 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1185 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
1186 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
1187 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1188 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1189 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1190 ; GFX8-NEXT: s_endpgm
1192 ; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1193 ; GFX9-NODL: ; %bb.0: ; %entry
1194 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1195 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1196 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1197 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1198 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1199 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1200 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1201 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1202 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1203 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1204 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1205 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1206 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1207 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
1208 ; GFX9-NODL-NEXT: s_endpgm
1210 ; GFX9-DL-LABEL: udot2_v4i16_Hi:
1211 ; GFX9-DL: ; %bb.0: ; %entry
1212 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1213 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1214 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1215 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1216 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1217 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1218 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1219 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1220 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1221 ; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1222 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
1223 ; GFX9-DL-NEXT: s_endpgm
1225 ; GFX10-DL-LABEL: udot2_v4i16_Hi:
1226 ; GFX10-DL: ; %bb.0: ; %entry
1227 ; GFX10-DL-NEXT: s_clause 0x1
1228 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1229 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1230 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1231 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1232 ; GFX10-DL-NEXT: s_clause 0x1
1233 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1234 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1235 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1236 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1237 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
1238 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1239 ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1240 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
1241 ; GFX10-DL-NEXT: s_endpgm
1242 ptr addrspace(1) %src2,
1243 ptr addrspace(1) nocapture %dst) {
; Each work-item loads one <4 x i16> from %src1 and one from %src2, indexed by
; its workitem id; only lanes 2 and 3 are used afterwards.
1245 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1246 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1247 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1248 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1249 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
; Lane 2 product: zext(%vec1[2]) * zext(%vec2[2]).
1251 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1252 %conv = zext i16 %s1.elt1 to i32
1253 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1254 %conv2 = zext i16 %s2.elt1 to i32
1255 %mul1 = mul i32 %conv2, %conv
; Lane 3 product: zext(%vec1[3]) * zext(%vec2[3]).
1257 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1258 %conv3 = zext i16 %s1.elt2 to i32
1259 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1260 %conv4 = zext i16 %s2.elt2 to i32
1261 %mul2 = mul i32 %conv4, %conv3
; Accumulate both products into the i32 read from %dst and write it back.
1263 %s3 = load i32, ptr addrspace(1) %dst, align 4
1264 %add = add i32 %mul2, %s3
1265 %add6 = add i32 %add, %mul1
1266 store i32 %add6, ptr addrspace(1) %dst, align 4
; Negative test: the products use lanes 0 and 2 of the <4 x i16> inputs, which
; are not adjacent 16-bit lanes. The checks expect both dwords of each vector
; to be loaded and the low 16 bits of each dword to be multiplied separately;
; no v_dot2 instruction is expected on any tested subtarget.
1270 define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
1271 ; GFX7-LABEL: notudot2_v4i16_Even:
1272 ; GFX7: ; %bb.0: ; %entry
1273 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1274 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1275 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1276 ; GFX7-NEXT: s_mov_b32 s10, 0
1277 ; GFX7-NEXT: s_mov_b32 s11, s7
1278 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1279 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
1280 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1281 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1282 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1283 ; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
1284 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1285 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1286 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1287 ; GFX7-NEXT: s_mov_b32 s6, -1
1288 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1289 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
1290 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1291 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
1292 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
1293 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
1294 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1295 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0
1296 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
1297 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1298 ; GFX7-NEXT: s_endpgm
1300 ; GFX8-LABEL: notudot2_v4i16_Even:
1301 ; GFX8: ; %bb.0: ; %entry
1302 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1303 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1304 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1305 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1306 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1307 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1308 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1309 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1310 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
1311 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1312 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1313 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1314 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1315 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1316 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
1317 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1318 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
1319 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
1320 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
1321 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1322 ; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
1323 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
1324 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1325 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1326 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1327 ; GFX8-NEXT: s_endpgm
1329 ; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1330 ; GFX9-NODL: ; %bb.0: ; %entry
1331 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1332 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1333 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1334 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1335 ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1336 ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1337 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1338 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
1339 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1340 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1341 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1342 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1343 ; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
1344 ; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
1345 ; GFX9-NODL-NEXT: s_endpgm
1347 ; GFX9-DL-LABEL: notudot2_v4i16_Even:
1348 ; GFX9-DL: ; %bb.0: ; %entry
1349 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1350 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1351 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1352 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1353 ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1354 ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1355 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1356 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
1357 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1358 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1359 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1360 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1361 ; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1362 ; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
1363 ; GFX9-DL-NEXT: s_endpgm
1365 ; GFX10-DL-LABEL: notudot2_v4i16_Even:
1366 ; GFX10-DL: ; %bb.0: ; %entry
1367 ; GFX10-DL-NEXT: s_clause 0x1
1368 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1369 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1370 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1371 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1372 ; GFX10-DL-NEXT: s_clause 0x1
1373 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1374 ; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1375 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1376 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1377 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1378 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1379 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1380 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1381 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1382 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1383 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1384 ; GFX10-DL-NEXT: s_endpgm
1385 ptr addrspace(1) %src2,
1386 ptr addrspace(1) nocapture %dst) {
; Each work-item loads one <4 x i16> from %src1 and one from %src2, indexed by
; its workitem id; only lanes 0 and 2 are used afterwards.
1388 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1389 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1390 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1391 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1392 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
; Lane 0 product: zext(%vec1[0]) * zext(%vec2[0]).
1394 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1395 %conv = zext i16 %s1.elt1 to i32
1396 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1397 %conv2 = zext i16 %s2.elt1 to i32
1398 %mul1 = mul i32 %conv2, %conv
; Lane 2 product: zext(%vec1[2]) * zext(%vec2[2]) -- lane 1 is skipped.
1400 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1401 %conv3 = zext i16 %s1.elt2 to i32
1402 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1403 %conv4 = zext i16 %s2.elt2 to i32
1404 %mul2 = mul i32 %conv4, %conv3
; Accumulate both products into the i32 read from %dst and write it back.
1406 %s3 = load i32, ptr addrspace(1) %dst, align 4
1407 %add = add i32 %mul2, %s3
1408 %add6 = add i32 %add, %mul1
1409 store i32 %add6, ptr addrspace(1) %dst, align 4
1413 define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
1414 ; GFX7-LABEL: notudot2_v4i16_Middle:
1415 ; GFX7: ; %bb.0: ; %entry
1416 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1417 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1418 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1419 ; GFX7-NEXT: s_mov_b32 s10, 0
1420 ; GFX7-NEXT: s_mov_b32 s11, s7
1421 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1422 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
1423 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1424 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1425 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1426 ; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
1427 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1428 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1429 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1430 ; GFX7-NEXT: s_mov_b32 s6, -1
1431 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1432 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
1433 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1434 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
1435 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1436 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1437 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0
1439 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
1440 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1441 ; GFX7-NEXT: s_endpgm
1443 ; GFX8-LABEL: notudot2_v4i16_Middle:
1444 ; GFX8: ; %bb.0: ; %entry
1445 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1446 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1447 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1448 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1449 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1450 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1451 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1452 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1453 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
1454 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1455 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1456 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1457 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1458 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1459 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
1460 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1461 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
1462 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1463 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1464 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1465 ; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
1466 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
1467 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1468 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1469 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1470 ; GFX8-NEXT: s_endpgm
1472 ; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1473 ; GFX9-NODL: ; %bb.0: ; %entry
1474 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1475 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1476 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1477 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1478 ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1479 ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1480 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1481 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
1482 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1483 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1484 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1485 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
1487 ; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
1488 ; GFX9-NODL-NEXT: s_endpgm
1490 ; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1491 ; GFX9-DL: ; %bb.0: ; %entry
1492 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1493 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1494 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1495 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1496 ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1497 ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1498 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1499 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
1500 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1501 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1502 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1503 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1504 ; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1505 ; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
1506 ; GFX9-DL-NEXT: s_endpgm
1508 ; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1509 ; GFX10-DL: ; %bb.0: ; %entry
1510 ; GFX10-DL-NEXT: s_clause 0x1
1511 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1512 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1513 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1514 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1515 ; GFX10-DL-NEXT: s_clause 0x1
1516 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1517 ; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1518 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1519 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1520 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1521 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1522 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1523 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1524 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1525 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1526 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1527 ; GFX10-DL-NEXT: s_endpgm
1528 ptr addrspace(1) %src2,
1529 ptr addrspace(1) nocapture %dst) {
1531 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1532 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1533 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1534 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1535 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
1537 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1538 %conv = zext i16 %s1.elt1 to i32
1539 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1540 %conv2 = zext i16 %s2.elt1 to i32
1541 %mul1 = mul i32 %conv2, %conv
1543 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1544 %conv3 = zext i16 %s1.elt2 to i32
1545 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1546 %conv4 = zext i16 %s2.elt2 to i32
1547 %mul2 = mul i32 %conv4, %conv3
1549 %s3 = load i32, ptr addrspace(1) %dst, align 4
1550 %add = add i32 %mul2, %s3
1551 %add6 = add i32 %add, %mul1
1552 store i32 %add6, ptr addrspace(1) %dst, align 4
; Negative test for the unsigned dot2 pattern: each product pairs element 0 of
; one <2 x i16> input with element 1 of the other, so the lanes are crossed.
; Every expected sequence below is a 24-bit multiply/mad followed by an add;
; none of them contains a v_dot2 instruction.
1556 define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
1557 ; GFX7-LABEL: notudot2_DiffIndex:
1558 ; GFX7: ; %bb.0: ; %entry
1559 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1560 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1561 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1562 ; GFX7-NEXT: s_mov_b32 s10, 0
1563 ; GFX7-NEXT: s_mov_b32 s11, s7
1564 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1565 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
1566 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1567 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1568 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1569 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
1570 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1571 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1572 ; GFX7-NEXT: s_mov_b32 s6, -1
1573 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1574 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1575 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
1576 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1577 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1578 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
1579 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1580 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s0
1581 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0
1582 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1583 ; GFX7-NEXT: s_endpgm
1585 ; GFX8-LABEL: notudot2_DiffIndex:
1586 ; GFX8: ; %bb.0: ; %entry
1587 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1588 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1589 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1590 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1591 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1592 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1593 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1595 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1596 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1597 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1598 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1599 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1600 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1601 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
1602 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1603 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1604 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1605 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
1606 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1607 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
1608 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
1609 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1610 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1611 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1612 ; GFX8-NEXT: s_endpgm
1614 ; GFX9-NODL-LABEL: notudot2_DiffIndex:
1615 ; GFX9-NODL: ; %bb.0: ; %entry
1616 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1617 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1618 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1619 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1620 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1621 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1622 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1623 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1624 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1625 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1626 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1627 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1628 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1629 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
1630 ; GFX9-NODL-NEXT: s_endpgm
1632 ; GFX9-DL-LABEL: notudot2_DiffIndex:
1633 ; GFX9-DL: ; %bb.0: ; %entry
1634 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1635 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1636 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1637 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1638 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1639 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1640 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1641 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1642 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1643 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1644 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1645 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1646 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
1647 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
1648 ; GFX9-DL-NEXT: s_endpgm
1650 ; GFX10-DL-LABEL: notudot2_DiffIndex:
1651 ; GFX10-DL: ; %bb.0: ; %entry
1652 ; GFX10-DL-NEXT: s_clause 0x1
1653 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1654 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1655 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1656 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1657 ; GFX10-DL-NEXT: s_clause 0x1
1658 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1659 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1660 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1661 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1662 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1663 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1664 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1665 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1666 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1667 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1668 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1669 ; GFX10-DL-NEXT: s_endpgm
1670 ptr addrspace(1) %src2,
1671 ptr addrspace(1) nocapture %dst) {
; Load one <2 x i16> from each source buffer, indexed by the value returned
; from the workitem.id.x intrinsic.
1673 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1674 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1675 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1676 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1677 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; First product: zext(%vec1 element 0) * zext(%vec2 element 1) -- the element
; indices of the two operands differ.
1679 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1680 %conv = zext i16 %s1.elt1 to i32
1681 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1682 %conv2 = zext i16 %s2.elt1 to i32
1683 %mul1 = mul i32 %conv2, %conv
; Second product: zext(%vec1 element 1) * zext(%vec2 element 0) -- again the
; element indices differ.
1685 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1686 %conv3 = zext i16 %s1.elt2 to i32
1687 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1688 %conv4 = zext i16 %s2.elt2 to i32
1689 %mul2 = mul i32 %conv4, %conv3
; Add both products to the i32 currently stored at %dst and write the sum
; back to %dst.
1691 %s3 = load i32, ptr addrspace(1) %dst, align 4
1692 %add = add i32 %mul2, %s3
1693 %add6 = add i32 %add, %mul1
1694 store i32 %add6, ptr addrspace(1) %dst, align 4
; Unsigned dot2-shaped computation in which the intermediate sum %add1
; (%mul2 + %s3) has two uses: it feeds %add2 and is added once more to form
; %res. Every expected sequence below keeps a separate 24-bit mad for %add1
; and finishes with an add; none of them contains a v_dot2 instruction.
; NOTE(review): presumably the second use of %add1 is what keeps the dot2 fold
; from applying -- confirm against the combine.
1698 define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
1699 ; GFX7-LABEL: udot2_MultipleUses_add1:
1700 ; GFX7: ; %bb.0: ; %entry
1701 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
1702 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
1703 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1704 ; GFX7-NEXT: s_mov_b32 s6, 0
1705 ; GFX7-NEXT: s_mov_b32 s7, s3
1706 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1707 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
1708 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1709 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1710 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1711 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
1712 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1713 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1714 ; GFX7-NEXT: s_mov_b32 s2, -1
1715 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1716 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1717 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
1718 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1719 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1720 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
1721 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1722 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4
1723 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
1724 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1725 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1726 ; GFX7-NEXT: s_endpgm
1728 ; GFX8-LABEL: udot2_MultipleUses_add1:
1729 ; GFX8: ; %bb.0: ; %entry
1730 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1731 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1732 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1733 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1734 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1735 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1736 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1737 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1738 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1739 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1740 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1741 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1742 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1743 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1744 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
1745 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1746 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1747 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
1748 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1749 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1750 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
1751 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
1752 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
1753 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1754 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1755 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1756 ; GFX8-NEXT: s_endpgm
1758 ; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1759 ; GFX9-NODL: ; %bb.0: ; %entry
1760 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1761 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1762 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1763 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1764 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1765 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1766 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1767 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1768 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1769 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1770 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1771 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1772 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1773 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
1774 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
1775 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
1776 ; GFX9-NODL-NEXT: s_endpgm
1778 ; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1779 ; GFX9-DL: ; %bb.0: ; %entry
1780 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1781 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1782 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1783 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1784 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1785 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1786 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1787 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1788 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1789 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1790 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1791 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1792 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1793 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
1794 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
1795 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
1796 ; GFX9-DL-NEXT: s_endpgm
1798 ; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1799 ; GFX10-DL: ; %bb.0: ; %entry
1800 ; GFX10-DL-NEXT: s_clause 0x1
1801 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1802 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1803 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1804 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1805 ; GFX10-DL-NEXT: s_clause 0x1
1806 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1807 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1808 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1809 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1810 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1811 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1812 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1813 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
1814 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1815 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1816 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1817 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
1818 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
1819 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1820 ; GFX10-DL-NEXT: s_endpgm
1821 ptr addrspace(1) %src2,
1822 ptr addrspace(1) nocapture %dst) {
; Load one <2 x i16> from each source buffer, indexed by the value returned
; from the workitem.id.x intrinsic.
1824 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1825 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1826 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1827 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1828 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; %mul1 = zext(%vec2 element 0) * zext(%vec1 element 0).
1830 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1831 %conv = zext i16 %s1.elt1 to i32
1832 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1833 %conv2 = zext i16 %s2.elt1 to i32
1834 %mul1 = mul i32 %conv2, %conv
; %mul2 = zext(%vec2 element 1) * zext(%vec1 element 1).
1836 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1837 %conv3 = zext i16 %s1.elt2 to i32
1838 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1839 %conv4 = zext i16 %s2.elt2 to i32
1840 %mul2 = mul i32 %conv4, %conv3
; Accumulate onto the i32 loaded from %dst. %add1 is consumed by %add2 and
; then reused below.
1842 %s3 = load i32, ptr addrspace(1) %dst, align 4
1843 %add1 = add i32 %mul2, %s3
1844 %add2 = add i32 %add1, %mul1
; Second use of %add1; the result is stored back to %dst.
1846 %res = add i32 %add2, %add1
1847 store i32 %res, ptr addrspace(1) %dst, align 4
; Signed (sext) dot2-shaped computation in which the intermediate sum %add1
; (%mul2 + %s3) has two uses: it feeds %add2 and is added once more to form
; %res. Every expected sequence below keeps a separate signed 24-bit mad for
; %add1 and finishes with an add; none of them contains a v_dot2 instruction.
; NOTE(review): presumably the second use of %add1 is what keeps the dot2 fold
; from applying -- confirm against the combine.
1851 define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
1852 ; GFX7-LABEL: idot2_MultipleUses_add1:
1853 ; GFX7: ; %bb.0: ; %entry
1854 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
1855 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
1856 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1857 ; GFX7-NEXT: s_mov_b32 s6, 0
1858 ; GFX7-NEXT: s_mov_b32 s7, s3
1859 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
1861 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1862 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1863 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1864 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
1865 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1866 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1867 ; GFX7-NEXT: s_mov_b32 s2, -1
1868 ; GFX7-NEXT: s_waitcnt vmcnt(1)
1869 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
1870 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
1871 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1872 ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
1873 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
1874 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1875 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4
1876 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0
1877 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
1878 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1879 ; GFX7-NEXT: s_endpgm
1881 ; GFX8-LABEL: idot2_MultipleUses_add1:
1882 ; GFX8: ; %bb.0: ; %entry
1883 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1884 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1885 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1886 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1887 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1888 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1889 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1890 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1891 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1892 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1893 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1894 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1895 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1896 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1897 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
1898 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
1899 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1900 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
1901 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
1902 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1903 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
1904 ; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
1905 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
1906 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1907 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1908 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1909 ; GFX8-NEXT: s_endpgm
1911 ; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1912 ; GFX9-NODL: ; %bb.0: ; %entry
1913 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1914 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1915 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1916 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1917 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1918 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1919 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
1920 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
1921 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1922 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1923 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
1924 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
1925 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1926 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
1927 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
1928 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
1929 ; GFX9-NODL-NEXT: s_endpgm
1931 ; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1932 ; GFX9-DL: ; %bb.0: ; %entry
1933 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1934 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1935 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1936 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1937 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1938 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1939 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1940 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
1941 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1942 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1943 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
1944 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
1945 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1946 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
1947 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
1948 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
1949 ; GFX9-DL-NEXT: s_endpgm
1951 ; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1952 ; GFX10-DL: ; %bb.0: ; %entry
1953 ; GFX10-DL-NEXT: s_clause 0x1
1954 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1955 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1956 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1957 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX10-DL-NEXT: s_clause 0x1
1959 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1960 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1961 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1962 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
1963 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1964 ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
1965 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1966 ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
1967 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1968 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
1969 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1970 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
1971 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
1972 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
1973 ; GFX10-DL-NEXT: s_endpgm
1974 ptr addrspace(1) %src2,
1975 ptr addrspace(1) nocapture %dst) {
; Load one <2 x i16> from each source buffer, indexed by the value returned
; from the workitem.id.x intrinsic.
1977 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1978 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1979 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1980 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1981 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; %mul1 = sext(%vec2 element 0) * sext(%vec1 element 0).
1983 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1984 %conv = sext i16 %s1.elt1 to i32
1985 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1986 %conv2 = sext i16 %s2.elt1 to i32
1987 %mul1 = mul i32 %conv2, %conv
; %mul2 = sext(%vec2 element 1) * sext(%vec1 element 1).
1989 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1990 %conv3 = sext i16 %s1.elt2 to i32
1991 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1992 %conv4 = sext i16 %s2.elt2 to i32
1993 %mul2 = mul i32 %conv4, %conv3
; Accumulate onto the i32 loaded from %dst. %add1 is consumed by %add2 and
; then reused below.
1995 %s3 = load i32, ptr addrspace(1) %dst, align 4
1996 %add1 = add i32 %mul2, %s3
1997 %add2 = add i32 %add1, %mul1
; Second use of %add1; the result is stored back to %dst.
1999 %res = add i32 %add2, %add1
2000 store i32 %res, ptr addrspace(1) %dst, align 4
; Unsigned dot2-shaped computation in which the product %mul1 has two uses:
; it is added to the loaded accumulator (%add0) and added again at the end
; (%add2). Every expected sequence below is built from 24-bit mul/mad
; instructions and adds; none of them contains a v_dot2 instruction.
; NOTE(review): presumably the second use of %mul1 is what keeps the dot2 fold
; from applying -- confirm against the combine.
2004 define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
2005 ; GFX7-LABEL: udot2_MultipleUses_mul1:
2006 ; GFX7: ; %bb.0: ; %entry
2007 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
2008 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
2009 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2010 ; GFX7-NEXT: s_mov_b32 s6, 0
2011 ; GFX7-NEXT: s_mov_b32 s7, s3
2012 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2013 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2014 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2015 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2016 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2017 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
2018 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2019 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2020 ; GFX7-NEXT: s_mov_b32 s2, -1
2021 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2022 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
2023 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
2024 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2025 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2026 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2027 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2028 ; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4
2029 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
2030 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
2031 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2032 ; GFX7-NEXT: s_endpgm
2034 ; GFX8-LABEL: udot2_MultipleUses_mul1:
2035 ; GFX8: ; %bb.0: ; %entry
2036 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2037 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2038 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2039 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2040 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2041 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2042 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2043 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2044 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2045 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2046 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2047 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2048 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
2049 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2050 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
2051 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2052 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2053 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
2054 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2055 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2056 ; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
2057 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
2058 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
2059 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2060 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2061 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2062 ; GFX8-NEXT: s_endpgm
2064 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
2065 ; GFX9-NODL: ; %bb.0: ; %entry
2066 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2067 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2068 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2069 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2070 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2071 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2072 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
2073 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2074 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2075 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
2076 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2077 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
2078 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2079 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3
2080 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2081 ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0
2082 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2
2083 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
2084 ; GFX9-NODL-NEXT: s_endpgm
2086 ; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
2087 ; GFX9-DL: ; %bb.0: ; %entry
2088 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2089 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2090 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2091 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2092 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2093 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2094 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2095 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2096 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2097 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1
2098 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2099 ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2
2100 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2101 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3
2102 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2103 ; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0
2104 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2
2105 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
2106 ; GFX9-DL-NEXT: s_endpgm
2108 ; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2109 ; GFX10-DL: ; %bb.0: ; %entry
2110 ; GFX10-DL-NEXT: s_clause 0x1
2111 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2112 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2113 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2114 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2115 ; GFX10-DL-NEXT: s_clause 0x1
2116 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2117 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2118 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2119 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2120 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2121 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
2122 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2123 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2
2124 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2125 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
2126 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2127 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
2128 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2129 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
2130 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2131 ; GFX10-DL-NEXT: s_endpgm
2132 ptr addrspace(1) %src2,
2133 ptr addrspace(1) nocapture %dst) {
; Load one <2 x i16> from each source buffer, indexed by the value returned
; from the workitem.id.x intrinsic.
2135 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2136 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2137 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2138 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2139 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; %mul1 = zext(%vec2 element 0) * zext(%vec1 element 0); used twice below.
2141 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2142 %conv = zext i16 %s1.elt1 to i32
2143 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2144 %conv2 = zext i16 %s2.elt1 to i32
2145 %mul1 = mul i32 %conv2, %conv
; %mul2 = zext(%vec2 element 1) * zext(%vec1 element 1).
2147 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2148 %conv3 = zext i16 %s1.elt2 to i32
2149 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2150 %conv4 = zext i16 %s2.elt2 to i32
2151 %mul2 = mul i32 %conv4, %conv3
; First use of %mul1: add it to the i32 loaded from %dst.
2153 %s3 = load i32, ptr addrspace(1) %dst, align 4
2154 %add0 = add i32 %mul1, %s3
; Add %mul2, then add %mul1 a second time.
2156 %add1 = add i32 %mul2, %add0
2157 %add2 = add i32 %add1, %mul1
; Store the final sum back to %dst.
2159 store i32 %add2, ptr addrspace(1) %dst, align 4
; idot2_MultipleUses_mul1: signed (sext i16 -> i32) two-element dot product
; accumulated into the i32 loaded from %dst, where %mul1 (the product of the
; element-0 lanes) feeds two adds, %add0 and %add2. The stored value is
; %mul1 + %s3 + %mul2 + %mul1.
; In the expected output below every run configuration, including the two
; dot-capable ones, uses v_mad_i32_i24 / v_mul_i32_i24 (plus v_add3_u32 on
; the gfx9 and gfx10 runs); no dot2 instruction appears.
2163 define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
2164 ; GFX7-LABEL: idot2_MultipleUses_mul1:
2165 ; GFX7: ; %bb.0: ; %entry
2166 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
2167 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
2168 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2169 ; GFX7-NEXT: s_mov_b32 s6, 0
2170 ; GFX7-NEXT: s_mov_b32 s7, s3
2171 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2172 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2173 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2174 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2175 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2176 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
2177 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2178 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2179 ; GFX7-NEXT: s_mov_b32 s2, -1
2180 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2181 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
2182 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2183 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2184 ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
2185 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2186 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2187 ; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4
2188 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
2189 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
2190 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2191 ; GFX7-NEXT: s_endpgm
2193 ; GFX8-LABEL: idot2_MultipleUses_mul1:
2194 ; GFX8: ; %bb.0: ; %entry
2195 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2196 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2197 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2198 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2199 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2200 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2201 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2202 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2203 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2204 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2205 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2206 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2207 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
2208 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2209 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
2210 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
2211 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2212 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
2213 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2214 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2215 ; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
2216 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
2217 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
2218 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2219 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2220 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2221 ; GFX8-NEXT: s_endpgm
2223 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2224 ; GFX9-NODL: ; %bb.0: ; %entry
2225 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2226 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2227 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2228 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2229 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2230 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2231 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
2232 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2233 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2234 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16
2235 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2236 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16
2237 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2238 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
2239 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2240 ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
2241 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2
2242 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
2243 ; GFX9-NODL-NEXT: s_endpgm
2245 ; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2246 ; GFX9-DL: ; %bb.0: ; %entry
2247 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2248 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2249 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2250 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2251 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2252 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2253 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2254 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2255 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2256 ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
2257 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2258 ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
2259 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2260 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
2261 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2262 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
2263 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2
2264 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
2265 ; GFX9-DL-NEXT: s_endpgm
2267 ; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2268 ; GFX10-DL: ; %bb.0: ; %entry
2269 ; GFX10-DL-NEXT: s_clause 0x1
2270 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2271 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2272 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2273 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2274 ; GFX10-DL-NEXT: s_clause 0x1
2275 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2276 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2277 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2278 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2279 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2280 ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
2281 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2282 ; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 16
2283 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2284 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
2285 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2286 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
2287 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2288 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
2289 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2290 ; GFX10-DL-NEXT: s_endpgm
2291 ptr addrspace(1) %src2,
2292 ptr addrspace(1) nocapture %dst) {
; Each workitem loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
2294 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2295 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2296 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2297 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2298 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Element-0 product; this is the value with two users (%add0 and %add2).
2300 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2301 %conv = sext i16 %s1.elt1 to i32
2302 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2303 %conv2 = sext i16 %s2.elt1 to i32
2304 %mul1 = mul i32 %conv2, %conv
; Element-1 product; single use in %add1.
2306 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2307 %conv3 = sext i16 %s1.elt2 to i32
2308 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2309 %conv4 = sext i16 %s2.elt2 to i32
2310 %mul2 = mul i32 %conv4, %conv3
; The accumulator is read from %dst, the same address the result is stored to.
2312 %s3 = load i32, ptr addrspace(1) %dst, align 4
2313 %add0 = add i32 %mul1, %s3
2315 %add1 = add i32 %mul2, %add0
; Second use of %mul1.
2316 %add2 = add i32 %add1, %mul1
2318 store i32 %add2, ptr addrspace(1) %dst, align 4
; udot2_MultipleUses_mul2: unsigned (zext i16 -> i32) two-element dot product
; accumulated into the i32 loaded from %dst, where %mul2 (the product of the
; element-1 lanes) feeds two adds, %add0 and %add1. The stored value is
; %mul2 + %s3 + %mul2 + %mul1.
; In the expected output below every run configuration, including the two
; dot-capable ones, uses v_mad_u32_u24 / v_mul_u32_u24 (plus v_add3_u32 on
; the gfx9 and gfx10 runs); no dot2 instruction appears.
2322 define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
2323 ; GFX7-LABEL: udot2_MultipleUses_mul2:
2324 ; GFX7: ; %bb.0: ; %entry
2325 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
2326 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
2327 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2328 ; GFX7-NEXT: s_mov_b32 s6, 0
2329 ; GFX7-NEXT: s_mov_b32 s7, s3
2330 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2331 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2332 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2333 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2334 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2335 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
2336 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2337 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2338 ; GFX7-NEXT: s_mov_b32 s2, -1
2339 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2340 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
2341 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
2342 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2343 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2344 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2345 ; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4
2346 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2347 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
2348 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
2349 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2350 ; GFX7-NEXT: s_endpgm
2352 ; GFX8-LABEL: udot2_MultipleUses_mul2:
2353 ; GFX8: ; %bb.0: ; %entry
2354 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2355 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2356 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2357 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2358 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2359 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2360 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2361 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2362 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2363 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2364 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2365 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2366 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
2367 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2368 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
2369 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2370 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2371 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
2372 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2373 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2374 ; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
2375 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
2376 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
2377 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2378 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2379 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2380 ; GFX8-NEXT: s_endpgm
2382 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2383 ; GFX9-NODL: ; %bb.0: ; %entry
2384 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2385 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2386 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2387 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2388 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2389 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2390 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
2391 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2392 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2393 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2394 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2395 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2396 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
2397 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2398 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
2399 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
2400 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
2401 ; GFX9-NODL-NEXT: s_endpgm
2403 ; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2404 ; GFX9-DL: ; %bb.0: ; %entry
2405 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2406 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2407 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2408 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2409 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2410 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2411 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2412 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2413 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2414 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2415 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2416 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2417 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
2418 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2419 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
2420 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
2421 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
2422 ; GFX9-DL-NEXT: s_endpgm
2424 ; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2425 ; GFX10-DL: ; %bb.0: ; %entry
2426 ; GFX10-DL-NEXT: s_clause 0x1
2427 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2428 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2429 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2430 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2431 ; GFX10-DL-NEXT: s_clause 0x1
2432 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2433 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2434 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2435 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2436 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2437 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
2438 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2439 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2440 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2441 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
2442 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2443 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
2444 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2445 ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
2446 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2447 ; GFX10-DL-NEXT: s_endpgm
2448 ptr addrspace(1) %src2,
2449 ptr addrspace(1) nocapture %dst) {
; Each workitem loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
2451 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2452 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2453 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2454 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2455 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Element-0 product; single use in %add2.
2457 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2458 %conv = zext i16 %s1.elt1 to i32
2459 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2460 %conv2 = zext i16 %s2.elt1 to i32
2461 %mul1 = mul i32 %conv2, %conv
; Element-1 product; this is the value with two users (%add0 and %add1).
2463 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2464 %conv3 = zext i16 %s1.elt2 to i32
2465 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2466 %conv4 = zext i16 %s2.elt2 to i32
2467 %mul2 = mul i32 %conv4, %conv3
; The accumulator is read from %dst, the same address the result is stored to.
2469 %s3 = load i32, ptr addrspace(1) %dst, align 4
2470 %add0 = add i32 %mul2, %s3
; Second use of %mul2.
2472 %add1 = add i32 %mul2, %add0
2473 %add2 = add i32 %add1, %mul1
2475 store i32 %add2, ptr addrspace(1) %dst, align 4
; idot2_MultipleUses_mul2: signed (sext i16 -> i32) two-element dot product
; accumulated into the i32 loaded from %dst, where %mul2 (the product of the
; element-1 lanes) feeds two adds, %add0 and %add1. The stored value is
; %mul2 + %s3 + %mul2 + %mul1.
; In the expected output below every run configuration, including the two
; dot-capable ones, uses v_mad_i32_i24 / v_mul_i32_i24 (plus v_add3_u32 on
; the gfx9 and gfx10 runs); no dot2 instruction appears.
2479 define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
2480 ; GFX7-LABEL: idot2_MultipleUses_mul2:
2481 ; GFX7: ; %bb.0: ; %entry
2482 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
2483 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
2484 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2485 ; GFX7-NEXT: s_mov_b32 s6, 0
2486 ; GFX7-NEXT: s_mov_b32 s7, s3
2487 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2488 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2489 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2490 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2491 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2492 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
2493 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2494 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2495 ; GFX7-NEXT: s_mov_b32 s2, -1
2496 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2497 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
2498 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2499 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2500 ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
2501 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2502 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2503 ; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s4
2504 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
2505 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
2506 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2507 ; GFX7-NEXT: s_endpgm
2509 ; GFX8-LABEL: idot2_MultipleUses_mul2:
2510 ; GFX8: ; %bb.0: ; %entry
2511 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2512 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2513 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2514 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2515 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2516 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2517 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2518 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2519 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2520 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2521 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2522 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
2523 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
2524 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2525 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
2526 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
2527 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2528 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
2529 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2530 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2531 ; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
2532 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
2533 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
2534 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2535 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2536 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2537 ; GFX8-NEXT: s_endpgm
2539 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2540 ; GFX9-NODL: ; %bb.0: ; %entry
2541 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2542 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2543 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2544 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2545 ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2546 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2547 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
2548 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2549 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2550 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2551 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2552 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2553 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
2554 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
2556 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
2557 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
2558 ; GFX9-NODL-NEXT: s_endpgm
2560 ; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2561 ; GFX9-DL: ; %bb.0: ; %entry
2562 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2563 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2564 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2565 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2566 ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2567 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2568 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2569 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2570 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2571 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2572 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2573 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2574 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
2575 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2576 ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
2577 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
2578 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
2579 ; GFX9-DL-NEXT: s_endpgm
2581 ; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2582 ; GFX10-DL: ; %bb.0: ; %entry
2583 ; GFX10-DL-NEXT: s_clause 0x1
2584 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2585 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2586 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2587 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2588 ; GFX10-DL-NEXT: s_clause 0x1
2589 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2590 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2591 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2592 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2593 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2594 ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
2595 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2596 ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
2597 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2598 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
2599 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2600 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
2601 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2602 ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
2603 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
2604 ; GFX10-DL-NEXT: s_endpgm
2605 ptr addrspace(1) %src2,
2606 ptr addrspace(1) nocapture %dst) {
; Each workitem loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
2608 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2609 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2610 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2611 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2612 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
; Element-0 product; single use in %add2.
2614 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2615 %conv = sext i16 %s1.elt1 to i32
2616 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2617 %conv2 = sext i16 %s2.elt1 to i32
2618 %mul1 = mul i32 %conv2, %conv
; Element-1 product; this is the value with two users (%add0 and %add1).
2620 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2621 %conv3 = sext i16 %s1.elt2 to i32
2622 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2623 %conv4 = sext i16 %s2.elt2 to i32
2624 %mul2 = mul i32 %conv4, %conv3
; The accumulator is read from %dst, the same address the result is stored to.
2626 %s3 = load i32, ptr addrspace(1) %dst, align 4
2627 %add0 = add i32 %mul2, %s3
; Second use of %mul2.
2629 %add1 = add i32 %mul2, %add0
2630 %add2 = add i32 %add1, %mul1
2632 store i32 %add2, ptr addrspace(1) %dst, align 4
; udot2_acc16: two-element dot product computed entirely in i16. The vector
; elements are multiplied as i16 with no extension, and the accumulator is an
; i16 loaded from %dst and stored back to %dst as i16.
; In the expected output below the gfx700 run uses v_mad_u32_u24 with a short
; store, gfx803 and the gfx10 runs use v_mad_u16, and both gfx9 runs use
; v_mad_legacy_u16; no dot2 instruction appears for any run.
2636 define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
2637 ; GFX7-LABEL: udot2_acc16:
2638 ; GFX7: ; %bb.0: ; %entry
2639 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2640 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2641 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2642 ; GFX7-NEXT: s_mov_b32 s10, 0
2643 ; GFX7-NEXT: s_mov_b32 s11, s7
2644 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2645 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
2646 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2647 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2648 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2649 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2650 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2651 ; GFX7-NEXT: s_mov_b32 s6, -1
2652 ; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
2653 ; GFX7-NEXT: s_waitcnt vmcnt(2)
2654 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2655 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
2656 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2657 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
2658 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
2659 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2660 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
2661 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
2662 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
2663 ; GFX7-NEXT: s_endpgm
2665 ; GFX8-LABEL: udot2_acc16:
2666 ; GFX8: ; %bb.0: ; %entry
2667 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2668 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2669 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2670 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2671 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2672 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2673 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2674 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2675 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2676 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2677 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2678 ; GFX8-NEXT: flat_load_dword v2, v[0:1]
2679 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2680 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2681 ; GFX8-NEXT: flat_load_ushort v4, v[0:1]
2682 ; GFX8-NEXT: s_waitcnt vmcnt(2)
2683 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2684 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2685 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2686 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2687 ; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4
2688 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2689 ; GFX8-NEXT: flat_store_short v[0:1], v2
2690 ; GFX8-NEXT: s_endpgm
2692 ; GFX9-NODL-LABEL: udot2_acc16:
2693 ; GFX9-NODL: ; %bb.0: ; %entry
2694 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2695 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2696 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2697 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
2698 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2699 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
2700 ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3]
2701 ; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[6:7]
2702 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
2703 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
2704 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2705 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2706 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2707 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
2708 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
2709 ; GFX9-NODL-NEXT: global_store_short v1, v0, s[6:7]
2710 ; GFX9-NODL-NEXT: s_endpgm
2712 ; GFX9-DL-LABEL: udot2_acc16:
2713 ; GFX9-DL: ; %bb.0: ; %entry
2714 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2715 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2716 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2717 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
2718 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2719 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
2720 ; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
2721 ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7]
2722 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2723 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
2724 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2725 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2726 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2727 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
2728 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
2729 ; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7]
2730 ; GFX9-DL-NEXT: s_endpgm
2732 ; GFX10-DL-LABEL: udot2_acc16:
2733 ; GFX10-DL: ; %bb.0: ; %entry
2734 ; GFX10-DL-NEXT: s_clause 0x1
2735 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2736 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2737 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2738 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
2739 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2740 ; GFX10-DL-NEXT: s_clause 0x1
2741 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
2742 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
2743 ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
2744 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
2745 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
2746 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2747 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2748 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2749 ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
2750 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
2751 ; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7]
2752 ; GFX10-DL-NEXT: s_endpgm
2753 ptr addrspace(1) %src2,
2754 ptr addrspace(1) nocapture %dst) {
; Each workitem loads one <2 x i16> from %src1 and one from %src2, indexed by
; its workitem id.
2756 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2757 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2758 %v1 = load <2 x i16>, ptr addrspace(1) %gep1
2759 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2760 %v2 = load <2 x i16>, ptr addrspace(1) %gep2
; Both products stay in i16; there is no sext/zext to i32 in this kernel.
2762 %v1e1 = extractelement <2 x i16> %v1, i64 0
2763 %v2e1 = extractelement <2 x i16> %v2, i64 0
2764 %mul1 = mul i16 %v1e1, %v2e1
2766 %v1e2 = extractelement <2 x i16> %v1, i64 1
2767 %v2e2 = extractelement <2 x i16> %v2, i64 1
2768 %mul2 = mul i16 %v1e2, %v2e2
; 16-bit accumulator: loaded from and stored back to %dst as i16.
2770 %s2 = load i16, ptr addrspace(1) %dst, align 2
2771 %add1 = add i16 %mul2, %s2
2772 %add2 = add i16 %add1, %mul1
2773 store i16 %add2, ptr addrspace(1) %dst, align 2
; notsdot2_sext8: two-element dot product whose inputs are <2 x i8> vectors,
; sign-extended from i8 to i32 before the (nuw-flagged) multiplies, and
; accumulated into the i32 loaded from %dst.
; In the expected output below the gfx700 and gfx803 runs use v_mad_i32_i24,
; the gfx900 run uses SDWA multiplies plus v_add3_u32, and the dot-capable
; runs shuffle each input with v_perm_b32 and then use a 4 x i8 dot
; instruction (v_dot4_i32_i8 on gfx906, v_dot4c_i32_i8 on the gfx10 runs).
; No dot2 instruction appears for any run.
2777 define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
2778 ; GFX7-LABEL: notsdot2_sext8:
2779 ; GFX7: ; %bb.0: ; %entry
2780 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2781 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2782 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2783 ; GFX7-NEXT: s_mov_b32 s10, 0
2784 ; GFX7-NEXT: s_mov_b32 s11, s7
2785 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2786 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
2787 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2788 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
2789 ; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
2790 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2791 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
2792 ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2793 ; GFX7-NEXT: s_mov_b32 s6, -1
2794 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2795 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
2796 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
2797 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2798 ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8
2799 ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
2800 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2801 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
2802 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
2803 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
2804 ; GFX7-NEXT: s_endpgm
2806 ; GFX8-LABEL: notsdot2_sext8:
2807 ; GFX8: ; %bb.0: ; %entry
2808 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2809 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
2810 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
2811 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2812 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2813 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2814 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2815 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
2816 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2817 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2818 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2819 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
2820 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
2821 ; GFX8-NEXT: s_waitcnt vmcnt(1)
2822 ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
2823 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2824 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
2825 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2826 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
2827 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
2828 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
2829 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2830 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
2831 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
2832 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2833 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2834 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2835 ; GFX8-NEXT: s_endpgm
2837 ; GFX9-NODL-LABEL: notsdot2_sext8:
2838 ; GFX9-NODL: ; %bb.0: ; %entry
2839 ; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2840 ; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2841 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2842 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2843 ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
2844 ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3]
2845 ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
2846 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
2847 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2848 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2849 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
2850 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
2851 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2852 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2853 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
2854 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
2855 ; GFX9-NODL-NEXT: s_endpgm
2857 ; GFX9-DL-LABEL: notsdot2_sext8:
2858 ; GFX9-DL: ; %bb.0: ; %entry
2859 ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2860 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2861 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2862 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2863 ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
2864 ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3]
2865 ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2866 ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001
2867 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
2868 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2869 ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
2870 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2871 ; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
2872 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2873 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
2874 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
2875 ; GFX9-DL-NEXT: s_endpgm
2877 ; GFX10-DL-LABEL: notsdot2_sext8:
2878 ; GFX10-DL: ; %bb.0: ; %entry
2879 ; GFX10-DL-NEXT: s_clause 0x1
2880 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2881 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
2882 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2883 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2884 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2885 ; GFX10-DL-NEXT: s_clause 0x1
2886 ; GFX10-DL-NEXT: global_load_ushort v1, v0, s[0:1]
2887 ; GFX10-DL-NEXT: global_load_ushort v2, v0, s[2:3]
2888 ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2889 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2890 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2891 ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
2892 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2893 ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
2894 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2895 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
2896 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
2897 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7]
2898 ; GFX10-DL-NEXT: s_endpgm
2899 ptr addrspace(1) %src2,
2900 ptr addrspace(1) nocapture %dst) {
; Each workitem loads one <2 x i8> from %src1 and one from %src2, indexed by
; its workitem id.
2902 %idx = call i32 @llvm.amdgcn.workitem.id.x()
2903 %gep1 = getelementptr <2 x i8>, ptr addrspace(1) %src1, i32 %idx
2904 %vec1 = load <2 x i8>, ptr addrspace(1) %gep1
2905 %gep2 = getelementptr <2 x i8>, ptr addrspace(1) %src2, i32 %idx
2906 %vec2 = load <2 x i8>, ptr addrspace(1) %gep2
; Element-0 product of the sign-extended i8 lanes.
2908 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2909 %conv = sext i8 %s1.elt1 to i32
2910 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2911 %conv2 = sext i8 %s2.elt1 to i32
2912 %mul1 = mul nuw i32 %conv2, %conv
; Element-1 product of the sign-extended i8 lanes.
2914 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2915 %conv3 = sext i8 %s1.elt2 to i32
2916 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2917 %conv4 = sext i8 %s2.elt2 to i32
2918 %mul2 = mul nuw i32 %conv4, %conv3
; The accumulator is read from %dst, the same address the result is stored to.
2920 %s3 = load i32, ptr addrspace(1) %dst, align 4
2921 %add = add i32 %mul2, %s3
2922 %add6 = add i32 %add, %mul1
2923 store i32 %add6, ptr addrspace(1) %dst, align 4
; Intrinsic called at the top of each kernel above to produce %idx, the i32
; index used in the getelementptr on %src1 and %src2.
2927 declare i32 @llvm.amdgcn.workitem.id.x()