1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
8 ; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; @udot2: unsigned two-lane dot product with accumulate.
; Loads %vec1 and %vec2 (<2 x i16>), zero-extends every lane to i32, multiplies
; lane 0 by lane 0 and lane 1 by lane 1, adds both products to the i32 loaded
; from %dst and stores the sum back to %dst.
; Per the assertions below, the gfx906 run is expected to select a single
; v_dot2_u32_u16, while the gfx900 run uses two v_mad_u32_u24; the gfx700 and
; gfx803 sequences shown also use two v_mad_u32_u24.
10 define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
12 ; GFX7: ; %bb.0: ; %entry
13 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
14 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
15 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
16 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
17 ; GFX7-NEXT: s_mov_b32 s2, -1
18 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
20 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
21 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
22 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
24 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
25 ; GFX7-NEXT: s_and_b32 s4, s4, s8
26 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
27 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
28 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
29 ; GFX7-NEXT: s_and_b32 s5, s5, s8
30 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
31 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
32 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
36 ; GFX8: ; %bb.0: ; %entry
37 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
38 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
39 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
40 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
42 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
43 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
45 ; GFX8-NEXT: s_and_b32 s6, s3, s2
46 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
47 ; GFX8-NEXT: s_and_b32 s2, s4, s2
48 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
49 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
50 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
51 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
52 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
53 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
54 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
55 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
56 ; GFX8-NEXT: flat_store_dword v[0:1], v2
59 ; GFX9-NODL-LABEL: udot2:
60 ; GFX9-NODL: ; %bb.0: ; %entry
61 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
62 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
63 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
64 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
65 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
66 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
67 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
68 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
70 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
71 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
72 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
73 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
74 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
75 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
76 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
77 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
78 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
79 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
80 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
81 ; GFX9-NODL-NEXT: s_endpgm
83 ; GFX9-DL-LABEL: udot2:
84 ; GFX9-DL: ; %bb.0: ; %entry
85 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
86 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
87 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
88 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
89 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
90 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
91 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
92 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
93 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
95 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
96 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
97 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
98 ; GFX9-DL-NEXT: s_endpgm
99 <2 x i16> addrspace(1)* %src2,
100 i32 addrspace(1)* nocapture %dst) {
102 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
103 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: both operands are zero-extended to i32.
105 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
106 %conv = zext i16 %s1.elt1 to i32
107 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
108 %conv2 = zext i16 %s2.elt1 to i32
109 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: both operands are zero-extended to i32.
111 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
112 %conv3 = zext i16 %s1.elt2 to i32
113 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
114 %conv4 = zext i16 %s2.elt2 to i32
115 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (mul2 + s3) + mul1, where s3 is the previous value at %dst.
117 %s3 = load i32, i32 addrspace(1)* %dst, align 4
118 %add = add i32 %mul2, %s3
119 %add6 = add i32 %add, %mul1
120 store i32 %add6, i32 addrspace(1)* %dst, align 4
124 ; TODO: Support this pattern
126 ; add (mul (S0.y, S1.y), mul (S0.x, S1.x))) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; @udot2_MulMul: unsigned two-lane dot product where the two products are added
; to each other first and the accumulator loaded from %dst is added last
; ((mul2 + mul1) + s3).
; Per the assertions below, no run is expected to select a dot2 instruction for
; this shape, including gfx906: every target emits v_mul_u32_u24 followed by
; v_mad_u32_u24 and a separate add.
127 define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
128 ; GFX7-LABEL: udot2_MulMul:
129 ; GFX7: ; %bb.0: ; %entry
130 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
131 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
132 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
133 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
134 ; GFX7-NEXT: s_mov_b32 s2, -1
135 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
137 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
138 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
139 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
141 ; GFX7-NEXT: s_and_b32 s4, s4, s8
142 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
143 ; GFX7-NEXT: s_and_b32 s5, s5, s8
144 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
145 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, s5, v0
146 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
147 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
148 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s6, v0
149 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
150 ; GFX7-NEXT: s_endpgm
152 ; GFX8-LABEL: udot2_MulMul:
153 ; GFX8: ; %bb.0: ; %entry
154 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
155 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
156 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
157 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
159 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
160 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
161 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX8-NEXT: s_and_b32 s6, s3, s2
163 ; GFX8-NEXT: s_and_b32 s2, s4, s2
164 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
165 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
166 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
167 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
168 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, s2, v0
169 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
170 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s5, v0
171 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
172 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
173 ; GFX8-NEXT: flat_store_dword v[0:1], v2
174 ; GFX8-NEXT: s_endpgm
176 ; GFX9-NODL-LABEL: udot2_MulMul:
177 ; GFX9-NODL: ; %bb.0: ; %entry
178 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
179 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
180 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
181 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
183 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
184 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
185 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
187 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
188 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
189 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
190 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
191 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
192 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
193 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
194 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0
195 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
196 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
197 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
198 ; GFX9-NODL-NEXT: s_endpgm
200 ; GFX9-DL-LABEL: udot2_MulMul:
201 ; GFX9-DL: ; %bb.0: ; %entry
202 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
203 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
204 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
205 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
207 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
208 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
209 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
211 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
212 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
213 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
214 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
215 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
216 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
217 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
218 ; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0
219 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
220 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
221 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
222 ; GFX9-DL-NEXT: s_endpgm
223 <2 x i16> addrspace(1)* %src2,
224 i32 addrspace(1)* nocapture %dst) {
226 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
227 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: both operands are zero-extended to i32.
229 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
230 %conv = zext i16 %s1.elt1 to i32
231 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
232 %conv2 = zext i16 %s2.elt1 to i32
233 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: both operands are zero-extended to i32.
235 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
236 %conv3 = zext i16 %s1.elt2 to i32
237 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
238 %conv4 = zext i16 %s2.elt2 to i32
239 %mul2 = mul nuw i32 %conv4, %conv3
; Unlike @udot2, the two products are summed before the accumulator is added.
240 %s3 = load i32, i32 addrspace(1)* %dst, align 4
241 %add = add i32 %mul2, %mul1
242 %add6 = add i32 %add, %s3
243 store i32 %add6, i32 addrspace(1)* %dst, align 4
; @idot2: signed two-lane dot product with accumulate.
; Same shape as the unsigned variant, but every lane is sign-extended (sext)
; to i32 before the multiply; the sum (mul2 + s3) + mul1 is stored to %dst.
; Per the assertions below, the gfx906 run is expected to select a single
; v_dot2_i32_i16, while the other runs use two v_mad_i32_i24.
; NOTE(review): the multiplies carry `nuw` although their operands are
; sign-extended - confirm that this flag is intentional for the test.
247 define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
249 ; GFX7: ; %bb.0: ; %entry
250 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
251 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
252 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
253 ; GFX7-NEXT: s_mov_b32 s2, -1
254 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
256 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
257 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
258 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX7-NEXT: s_sext_i32_i16 s7, s4
260 ; GFX7-NEXT: s_ashr_i32 s4, s4, 16
261 ; GFX7-NEXT: s_sext_i32_i16 s8, s5
262 ; GFX7-NEXT: s_ashr_i32 s5, s5, 16
263 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
264 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
265 ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
266 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
267 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
268 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
269 ; GFX7-NEXT: s_endpgm
272 ; GFX8: ; %bb.0: ; %entry
273 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
274 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
275 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
276 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
277 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
278 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
279 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
280 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
281 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
283 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16
284 ; GFX8-NEXT: s_sext_i32_i16 s1, s3
285 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16
286 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
287 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
288 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
289 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
290 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
291 ; GFX8-NEXT: flat_store_dword v[0:1], v2
292 ; GFX8-NEXT: s_endpgm
294 ; GFX9-NODL-LABEL: idot2:
295 ; GFX9-NODL: ; %bb.0: ; %entry
296 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
297 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
298 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
299 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
300 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
301 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
302 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
303 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
304 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
306 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
307 ; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
308 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
309 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
310 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
311 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
312 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
313 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
314 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
315 ; GFX9-NODL-NEXT: s_endpgm
317 ; GFX9-DL-LABEL: idot2:
318 ; GFX9-DL: ; %bb.0: ; %entry
319 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
320 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
321 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
323 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
324 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
325 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
326 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
327 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
329 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
330 ; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3
331 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
332 ; GFX9-DL-NEXT: s_endpgm
333 <2 x i16> addrspace(1)* %src2,
334 i32 addrspace(1)* nocapture %dst) {
336 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
337 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: both operands are sign-extended to i32.
339 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
340 %conv = sext i16 %s1.elt1 to i32
341 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
342 %conv2 = sext i16 %s2.elt1 to i32
343 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: both operands are sign-extended to i32.
345 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
346 %conv3 = sext i16 %s1.elt2 to i32
347 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
348 %conv4 = sext i16 %s2.elt2 to i32
349 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (mul2 + s3) + mul1, where s3 is the previous value at %dst.
351 %s3 = load i32, i32 addrspace(1)* %dst, align 4
352 %add = add i32 %mul2, %s3
353 %add6 = add i32 %add, %mul1
354 store i32 %add6, i32 addrspace(1)* %dst, align 4
; @idot2_MixedTypedMul: negative test where the two products use different
; signedness. The lane 0 product is built from sign-extended operands, the
; lane 1 product from zero-extended operands.
; Per the assertions below, no run (gfx906 included) is expected to select a
; dot2 instruction: every target emits one v_mad_u32_u24 and one v_mad_i32_i24.
358 define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
359 ; GFX7-LABEL: idot2_MixedTypedMul:
360 ; GFX7: ; %bb.0: ; %entry
361 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
362 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
363 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
364 ; GFX7-NEXT: s_mov_b32 s2, -1
365 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
367 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
368 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
369 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
371 ; GFX7-NEXT: s_lshr_b32 s8, s5, 16
372 ; GFX7-NEXT: s_sext_i32_i16 s4, s4
373 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
374 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
375 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1
376 ; GFX7-NEXT: s_sext_i32_i16 s5, s5
377 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
378 ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0
379 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
380 ; GFX7-NEXT: s_endpgm
382 ; GFX8-LABEL: idot2_MixedTypedMul:
383 ; GFX8: ; %bb.0: ; %entry
384 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
385 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
386 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
387 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
388 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
389 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
390 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
391 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
392 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
393 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
394 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
395 ; GFX8-NEXT: s_sext_i32_i16 s1, s3
396 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
397 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
398 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
399 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
400 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
401 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
402 ; GFX8-NEXT: flat_store_dword v[0:1], v2
403 ; GFX8-NEXT: s_endpgm
405 ; GFX9-NODL-LABEL: idot2_MixedTypedMul:
406 ; GFX9-NODL: ; %bb.0: ; %entry
407 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
408 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
409 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
411 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
412 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
413 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
414 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
415 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
416 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
417 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
418 ; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
419 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
420 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
421 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
422 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
423 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
424 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
425 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
426 ; GFX9-NODL-NEXT: s_endpgm
428 ; GFX9-DL-LABEL: idot2_MixedTypedMul:
429 ; GFX9-DL: ; %bb.0: ; %entry
430 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
431 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
432 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
434 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
435 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
436 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
437 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
438 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
439 ; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
440 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
441 ; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
442 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
443 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
444 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
445 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
446 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
447 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
448 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
449 ; GFX9-DL-NEXT: s_endpgm
450 <2 x i16> addrspace(1)* %src2,
451 i32 addrspace(1)* nocapture %dst) {
453 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
454 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: signed (both operands sign-extended).
456 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
457 %conv = sext i16 %s1.elt1 to i32
458 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
459 %conv2 = sext i16 %s2.elt1 to i32
460 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: unsigned (both operands zero-extended).
462 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
463 %conv3 = zext i16 %s1.elt2 to i32
464 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
465 %conv4 = zext i16 %s2.elt2 to i32
466 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (mul2 + s3) + mul1, where s3 is the previous value at %dst.
468 %s3 = load i32, i32 addrspace(1)* %dst, align 4
469 %add = add i32 %mul2, %s3
470 %add6 = add i32 %add, %mul1
471 store i32 %add6, i32 addrspace(1)* %dst, align 4
; @udot2_alt_AddOperands: unsigned two-lane dot product with the operands of
; both adds commuted (s3 + mul2, then mul1 + add) compared with @udot2.
; Per the assertions below, the operand order does not matter: the gfx906 run
; is still expected to select v_dot2_u32_u16, and the other runs still use two
; v_mad_u32_u24.
475 define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
476 ; GFX7-LABEL: udot2_alt_AddOperands:
477 ; GFX7: ; %bb.0: ; %entry
478 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
479 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
480 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
481 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
482 ; GFX7-NEXT: s_mov_b32 s2, -1
483 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
485 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
486 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
487 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
489 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
490 ; GFX7-NEXT: s_and_b32 s4, s4, s8
491 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
492 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
493 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
494 ; GFX7-NEXT: s_and_b32 s5, s5, s8
495 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
496 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
497 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
498 ; GFX7-NEXT: s_endpgm
500 ; GFX8-LABEL: udot2_alt_AddOperands:
501 ; GFX8: ; %bb.0: ; %entry
502 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
503 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
504 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
505 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
506 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
507 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
508 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
509 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX8-NEXT: s_and_b32 s6, s3, s2
511 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
512 ; GFX8-NEXT: s_and_b32 s2, s4, s2
513 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
514 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
515 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
516 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
517 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
518 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
519 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
520 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
521 ; GFX8-NEXT: flat_store_dword v[0:1], v2
522 ; GFX8-NEXT: s_endpgm
524 ; GFX9-NODL-LABEL: udot2_alt_AddOperands:
525 ; GFX9-NODL: ; %bb.0: ; %entry
526 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
527 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
528 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
529 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
531 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
532 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
533 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
534 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
535 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
536 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
537 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
538 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
539 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
540 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
541 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
542 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
543 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
544 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
545 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
546 ; GFX9-NODL-NEXT: s_endpgm
548 ; GFX9-DL-LABEL: udot2_alt_AddOperands:
549 ; GFX9-DL: ; %bb.0: ; %entry
550 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
551 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
552 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
553 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
554 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
555 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
556 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
557 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
558 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
559 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
560 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
561 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
562 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
563 ; GFX9-DL-NEXT: s_endpgm
564 <2 x i16> addrspace(1)* %src2,
565 i32 addrspace(1)* nocapture %dst) {
567 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
568 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: both operands are zero-extended to i32.
570 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
571 %conv = zext i16 %s1.elt1 to i32
572 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
573 %conv2 = zext i16 %s2.elt1 to i32
574 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: both operands are zero-extended to i32.
576 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
577 %conv3 = zext i16 %s1.elt2 to i32
578 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
579 %conv4 = zext i16 %s2.elt2 to i32
580 %mul2 = mul nuw i32 %conv4, %conv3
; Commuted adds: the accumulator is the LHS of the first add and the lane 0
; product is the LHS of the second.
582 %s3 = load i32, i32 addrspace(1)* %dst, align 4
583 %add = add i32 %s3, %mul2
584 %add6 = add i32 %mul1, %add
585 store i32 %add6, i32 addrspace(1)* %dst, align 4
; @idot2_MixedExt: negative test where a single product mixes extension kinds.
; In lane 0 the %vec1 element is sign-extended but the %vec2 element is
; zero-extended; lane 1 sign-extends both operands.
; Per the assertions below, no run (gfx906 included) is expected to select a
; dot2 instruction: every target emits two v_mad_i32_i24.
589 define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
590 ; GFX7-LABEL: idot2_MixedExt:
591 ; GFX7: ; %bb.0: ; %entry
592 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
593 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
594 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
595 ; GFX7-NEXT: s_mov_b32 s2, -1
596 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
597 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
598 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
599 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
600 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
601 ; GFX7-NEXT: s_sext_i32_i16 s7, s4
602 ; GFX7-NEXT: s_ashr_i32 s4, s4, 16
603 ; GFX7-NEXT: s_and_b32 s8, s5, 0xffff
604 ; GFX7-NEXT: s_ashr_i32 s5, s5, 16
605 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
606 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
607 ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
608 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
609 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
610 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
611 ; GFX7-NEXT: s_endpgm
613 ; GFX8-LABEL: idot2_MixedExt:
614 ; GFX8: ; %bb.0: ; %entry
615 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
616 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
617 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
618 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
619 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
620 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
621 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
622 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
623 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
624 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
625 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16
626 ; GFX8-NEXT: s_and_b32 s1, s3, 0xffff
627 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16
628 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
629 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
630 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
631 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
632 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
633 ; GFX8-NEXT: flat_store_dword v[0:1], v2
634 ; GFX8-NEXT: s_endpgm
636 ; GFX9-NODL-LABEL: idot2_MixedExt:
637 ; GFX9-NODL: ; %bb.0: ; %entry
638 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
639 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
640 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
641 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
642 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
643 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
644 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
645 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
646 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
648 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
649 ; GFX9-NODL-NEXT: s_and_b32 s1, s3, 0xffff
650 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
651 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
652 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
653 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
654 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
655 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
656 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
657 ; GFX9-NODL-NEXT: s_endpgm
659 ; GFX9-DL-LABEL: idot2_MixedExt:
660 ; GFX9-DL: ; %bb.0: ; %entry
661 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
662 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
663 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
664 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
665 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
666 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
667 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
668 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
669 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
671 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
672 ; GFX9-DL-NEXT: s_and_b32 s1, s3, 0xffff
673 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
674 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
675 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
676 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
677 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
678 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
679 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
680 ; GFX9-DL-NEXT: s_endpgm
681 <2 x i16> addrspace(1)* %src2,
682 i32 addrspace(1)* nocapture %dst) {
684 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
685 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Lane 0 product: mixed extension - %vec1 lane is sext, %vec2 lane is zext.
687 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
688 %conv = sext i16 %s1.elt1 to i32
689 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
690 %conv2 = zext i16 %s2.elt1 to i32
691 %mul1 = mul nuw i32 %conv2, %conv
; Lane 1 product: both operands are sign-extended to i32.
693 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
694 %conv3 = sext i16 %s1.elt2 to i32
695 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
696 %conv4 = sext i16 %s2.elt2 to i32
697 %mul2 = mul nuw i32 %conv4, %conv3
; Accumulate as (mul2 + s3) + mul1, where s3 is the previous value at %dst.
699 %s3 = load i32, i32 addrspace(1)* %dst, align 4
700 %add = add i32 %mul2, %s3
701 %add6 = add i32 %add, %mul1
702 store i32 %add6, i32 addrspace(1)* %dst, align 4
; @notudot2_SameVec: negative test where each product multiplies a lane by
; itself instead of pairing a %vec1 lane with a %vec2 lane. Lane 0 of %vec1 is
; squared, lane 1 of %vec2 is squared, and both squares are added to the i32
; loaded from %dst. The multiplies here are plain `mul` (no nuw flag).
; Per the assertions below, no run (gfx906 included) is expected to select a
; dot2 instruction: every target emits two v_mad_u32_u24 whose two multiplicand
; operands are the same scalar register.
706 define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
707 ; GFX7-LABEL: notudot2_SameVec:
708 ; GFX7: ; %bb.0: ; %entry
709 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
710 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
711 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
712 ; GFX7-NEXT: s_mov_b32 s2, -1
713 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
714 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
715 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
716 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
717 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
718 ; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
719 ; GFX7-NEXT: s_lshr_b32 s5, s5, 16
720 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
721 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, s5, v0
722 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, s4, v0
723 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
724 ; GFX7-NEXT: s_endpgm
726 ; GFX8-LABEL: notudot2_SameVec:
727 ; GFX8: ; %bb.0: ; %entry
728 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
729 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
730 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
731 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
732 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
733 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
734 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
735 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
736 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
737 ; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
738 ; GFX8-NEXT: s_lshr_b32 s1, s3, 16
739 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
740 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2
741 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2
742 ; GFX8-NEXT: flat_store_dword v[0:1], v2
743 ; GFX8-NEXT: s_endpgm
745 ; GFX9-NODL-LABEL: notudot2_SameVec:
746 ; GFX9-NODL: ; %bb.0: ; %entry
747 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
748 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
749 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
750 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
751 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
752 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
753 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
754 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
755 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
756 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, 0xffff
757 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16
758 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
759 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
760 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
761 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
762 ; GFX9-NODL-NEXT: s_endpgm
764 ; GFX9-DL-LABEL: notudot2_SameVec:
765 ; GFX9-DL: ; %bb.0: ; %entry
766 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
767 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
768 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
769 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
770 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
771 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
772 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
773 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
774 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
775 ; GFX9-DL-NEXT: s_and_b32 s0, s2, 0xffff
776 ; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16
777 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
778 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
779 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
780 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
781 ; GFX9-DL-NEXT: s_endpgm
782 <2 x i16> addrspace(1)* %src2,
783 i32 addrspace(1)* nocapture %dst) {
785 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
786 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; First product: lane 0 of %vec1 times lane 0 of %vec1 (both extracts read
; %vec1, despite the %s2 name of the second one).
788 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
789 %conv = zext i16 %s1.elt1 to i32
790 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
791 %conv2 = zext i16 %s2.elt1 to i32
792 %mul1 = mul i32 %conv2, %conv
; Second product: lane 1 of %vec2 times lane 1 of %vec2 (both extracts read
; %vec2, despite the %s1 name of the first one).
794 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
795 %conv3 = zext i16 %s1.elt2 to i32
796 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
797 %conv4 = zext i16 %s2.elt2 to i32
798 %mul2 = mul i32 %conv4, %conv3
; Accumulate as (mul2 + s3) + mul1, where s3 is the previous value at %dst.
800 %s3 = load i32, i32 addrspace(1)* %dst, align 4
801 %add = add i32 %mul2, %s3
802 %add6 = add i32 %add, %mul1
803 store i32 %add6, i32 addrspace(1)* %dst, align 4
; udot2_v4i16 zero-extends elements 0 and 1 of two <4 x i16> vectors,
; multiplies them pairwise and adds both products to the i32 loaded from %dst,
; storing the sum back to %dst. The gfx906 checks expect a single
; v_dot2_u32_u16; the gfx700, gfx803 and gfx900 checks expect two
; v_mad_u32_u24 instead.
807 define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
808 ; GFX7-LABEL: udot2_v4i16:
809 ; GFX7: ; %bb.0: ; %entry
810 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
811 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
812 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
813 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
814 ; GFX7-NEXT: s_mov_b32 s2, -1
815 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
816 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
817 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
818 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
819 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
820 ; GFX7-NEXT: s_and_b32 s7, s4, s8
821 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16
822 ; GFX7-NEXT: s_and_b32 s8, s5, s8
823 ; GFX7-NEXT: s_lshr_b32 s5, s5, 16
824 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
825 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
826 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
827 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
828 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
829 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
830 ; GFX7-NEXT: s_endpgm
832 ; GFX8-LABEL: udot2_v4i16:
833 ; GFX8: ; %bb.0: ; %entry
834 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
835 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
836 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
837 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
838 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
839 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
840 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
841 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
842 ; GFX8-NEXT: s_and_b32 s6, s3, s2
843 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
844 ; GFX8-NEXT: s_and_b32 s2, s4, s2
845 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
846 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
847 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
848 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
849 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
850 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
851 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
852 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
853 ; GFX8-NEXT: flat_store_dword v[0:1], v2
854 ; GFX8-NEXT: s_endpgm
856 ; GFX9-NODL-LABEL: udot2_v4i16:
857 ; GFX9-NODL: ; %bb.0: ; %entry
858 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
859 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
860 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
861 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
862 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
863 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
864 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
865 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
867 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
868 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
869 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
870 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
871 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
872 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
873 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
874 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
875 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
876 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
877 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
878 ; GFX9-NODL-NEXT: s_endpgm
880 ; GFX9-DL-LABEL: udot2_v4i16:
881 ; GFX9-DL: ; %bb.0: ; %entry
882 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
883 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
884 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
886 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
887 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
888 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
889 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
890 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
891 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
892 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
893 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
894 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
895 ; GFX9-DL-NEXT: s_endpgm
896 <4 x i16> addrspace(1)* %src2,
897 i32 addrspace(1)* nocapture %dst) {
899 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
900 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
; Product of element 0 of each vector, widened to i32 with zext.
902 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
903 %conv = zext i16 %s1.elt1 to i32
904 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
905 %conv2 = zext i16 %s2.elt1 to i32
906 %mul1 = mul i32 %conv2, %conv
; Product of element 1 of each vector, widened to i32 with zext.
908 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
909 %conv3 = zext i16 %s1.elt2 to i32
910 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
911 %conv4 = zext i16 %s2.elt2 to i32
912 %mul2 = mul i32 %conv4, %conv3
; Add both products to the value currently stored at %dst and write it back.
914 %s3 = load i32, i32 addrspace(1)* %dst, align 4
915 %add = add i32 %mul2, %s3
916 %add6 = add i32 %add, %mul1
917 store i32 %add6, i32 addrspace(1)* %dst, align 4
; udot2_v4i16_Hi is the same multiply-accumulate as udot2_v4i16 but uses
; elements 2 and 3 of each <4 x i16> vector. The checks expect a single dword
; load per vector at a non-zero offset (0x1 in the gfx700 checks, 0x4 in the
; others). The gfx906 checks expect one v_dot2_u32_u16; the remaining targets
; expect two v_mad_u32_u24.
921 define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
922 ; GFX7-LABEL: udot2_v4i16_Hi:
923 ; GFX7: ; %bb.0: ; %entry
924 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
925 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
926 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
927 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
928 ; GFX7-NEXT: s_mov_b32 s2, -1
929 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x1
931 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1
932 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
933 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
934 ; GFX7-NEXT: s_and_b32 s7, s4, s8
935 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16
936 ; GFX7-NEXT: s_and_b32 s8, s5, s8
937 ; GFX7-NEXT: s_lshr_b32 s5, s5, 16
938 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
939 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
940 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
941 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
942 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
943 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
944 ; GFX7-NEXT: s_endpgm
946 ; GFX8-LABEL: udot2_v4i16_Hi:
947 ; GFX8: ; %bb.0: ; %entry
948 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
949 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
950 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
951 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
952 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x4
953 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x4
954 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
955 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
956 ; GFX8-NEXT: s_and_b32 s6, s3, s2
957 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
958 ; GFX8-NEXT: s_and_b32 s2, s4, s2
959 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
960 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
961 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
962 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
963 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
964 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
965 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
966 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
967 ; GFX8-NEXT: flat_store_dword v[0:1], v2
968 ; GFX8-NEXT: s_endpgm
970 ; GFX9-NODL-LABEL: udot2_v4i16_Hi:
971 ; GFX9-NODL: ; %bb.0: ; %entry
972 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
973 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
974 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
975 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
976 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4
977 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4
978 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
979 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
980 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
981 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
982 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
983 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
984 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
985 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
986 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
987 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
988 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
989 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
990 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
991 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
992 ; GFX9-NODL-NEXT: s_endpgm
994 ; GFX9-DL-LABEL: udot2_v4i16_Hi:
995 ; GFX9-DL: ; %bb.0: ; %entry
996 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
997 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
998 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4
1000 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x4
1001 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1002 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1003 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1004 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1005 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
1006 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
1007 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
1008 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1009 ; GFX9-DL-NEXT: s_endpgm
1010 <4 x i16> addrspace(1)* %src2,
1011 i32 addrspace(1)* nocapture %dst) {
1013 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1014 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
; Product of element 2 of each vector, widened to i32 with zext.
1016 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1017 %conv = zext i16 %s1.elt1 to i32
1018 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1019 %conv2 = zext i16 %s2.elt1 to i32
1020 %mul1 = mul i32 %conv2, %conv
; Product of element 3 of each vector, widened to i32 with zext.
1022 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1023 %conv3 = zext i16 %s1.elt2 to i32
1024 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1025 %conv4 = zext i16 %s2.elt2 to i32
1026 %mul2 = mul i32 %conv4, %conv3
; Add both products to the value currently stored at %dst and write it back.
1028 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1029 %add = add i32 %mul2, %s3
1030 %add6 = add i32 %add, %mul1
1031 store i32 %add6, i32 addrspace(1)* %dst, align 4
; notudot2_v4i16_Even multiplies elements 0 and 2 of two <4 x i16> vectors
; (zero-extended to i32) and accumulates both products into the i32 at %dst.
; None of the four check sets contains a v_dot2 instruction; all of them,
; including gfx906, expect two v_mad_u32_u24.
; NOTE(review): presumably the combine is rejected because the two elements
; come from different dwords of the vector - confirm against the combine code.
1035 define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
1036 ; GFX7-LABEL: notudot2_v4i16_Even:
1037 ; GFX7: ; %bb.0: ; %entry
1038 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1039 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1040 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1041 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1042 ; GFX7-NEXT: s_mov_b32 s2, -1
1043 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1044 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1045 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
1046 ; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0
1047 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1048 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1049 ; GFX7-NEXT: s_and_b32 s4, s4, s8
1050 ; GFX7-NEXT: s_and_b32 s7, s7, s8
1051 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
1052 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
1053 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
1054 ; GFX7-NEXT: s_and_b32 s6, s6, s8
1055 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
1056 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
1057 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1058 ; GFX7-NEXT: s_endpgm
1060 ; GFX8-LABEL: notudot2_v4i16_Even:
1061 ; GFX8: ; %bb.0: ; %entry
1062 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1063 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1064 ; GFX8-NEXT: s_mov_b32 s8, 0xffff
1065 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1066 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1067 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1068 ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0
1069 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX8-NEXT: s_and_b32 s3, s3, s8
1071 ; GFX8-NEXT: s_and_b32 s2, s2, s8
1072 ; GFX8-NEXT: s_and_b32 s5, s5, s8
1073 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
1074 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1075 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1076 ; GFX8-NEXT: s_and_b32 s4, s4, s8
1077 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
1078 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1079 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1080 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1081 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1082 ; GFX8-NEXT: s_endpgm
1084 ; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1085 ; GFX9-NODL: ; %bb.0: ; %entry
1086 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1087 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1088 ; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
1089 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1090 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1091 ; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1092 ; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0
1093 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1094 ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
1095 ; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8
1096 ; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
1097 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
1098 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1099 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1100 ; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8
1101 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
1102 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1103 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1104 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1105 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1106 ; GFX9-NODL-NEXT: s_endpgm
1108 ; GFX9-DL-LABEL: notudot2_v4i16_Even:
1109 ; GFX9-DL: ; %bb.0: ; %entry
1110 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1111 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1112 ; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
1113 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1114 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1115 ; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1116 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
1117 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1118 ; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
1119 ; GFX9-DL-NEXT: s_and_b32 s2, s2, s8
1120 ; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
1121 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
1122 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1123 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1124 ; GFX9-DL-NEXT: s_and_b32 s4, s4, s8
1125 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
1126 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1127 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1128 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1129 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1130 ; GFX9-DL-NEXT: s_endpgm
1131 <4 x i16> addrspace(1)* %src2,
1132 i32 addrspace(1)* nocapture %dst) {
1134 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1135 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
; Product of element 0 of each vector, widened to i32 with zext.
1137 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1138 %conv = zext i16 %s1.elt1 to i32
1139 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1140 %conv2 = zext i16 %s2.elt1 to i32
1141 %mul1 = mul i32 %conv2, %conv
; Product of element 2 (not element 1) of each vector.
1143 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1144 %conv3 = zext i16 %s1.elt2 to i32
1145 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1146 %conv4 = zext i16 %s2.elt2 to i32
1147 %mul2 = mul i32 %conv4, %conv3
; Add both products to the value currently stored at %dst and write it back.
1149 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1150 %add = add i32 %mul2, %s3
1151 %add6 = add i32 %add, %mul1
1152 store i32 %add6, i32 addrspace(1)* %dst, align 4
; notudot2_v4i16_Middle multiplies elements 1 and 2 of two <4 x i16> vectors
; (zero-extended to i32) and accumulates both products into the i32 at %dst.
; None of the four check sets contains a v_dot2 instruction; all of them,
; including gfx906, expect two v_mad_u32_u24.
; NOTE(review): presumably the combine is rejected because elements 1 and 2
; straddle two dwords of the vector - confirm against the combine code.
1156 define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
1157 ; GFX7-LABEL: notudot2_v4i16_Middle:
1158 ; GFX7: ; %bb.0: ; %entry
1159 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1160 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1161 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1162 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1163 ; GFX7-NEXT: s_mov_b32 s2, -1
1164 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1165 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1166 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
1167 ; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0
1168 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1170 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16
1171 ; GFX7-NEXT: s_and_b32 s7, s7, s8
1172 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
1173 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
1174 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
1175 ; GFX7-NEXT: s_lshr_b32 s6, s6, 16
1176 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
1177 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
1178 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1179 ; GFX7-NEXT: s_endpgm
1181 ; GFX8-LABEL: notudot2_v4i16_Middle:
1182 ; GFX8: ; %bb.0: ; %entry
1183 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1184 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1185 ; GFX8-NEXT: s_mov_b32 s8, 0xffff
1186 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1187 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1188 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1189 ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0
1190 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1191 ; GFX8-NEXT: s_and_b32 s3, s3, s8
1192 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
1193 ; GFX8-NEXT: s_and_b32 s5, s5, s8
1194 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
1195 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1196 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1197 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1198 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
1199 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1200 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1201 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1202 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1203 ; GFX8-NEXT: s_endpgm
1205 ; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1206 ; GFX9-NODL: ; %bb.0: ; %entry
1207 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1208 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1209 ; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
1210 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1212 ; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1213 ; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0
1214 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1215 ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
1216 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
1217 ; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
1218 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
1219 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1220 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1221 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1222 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
1223 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1224 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1225 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1226 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1227 ; GFX9-NODL-NEXT: s_endpgm
1229 ; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1230 ; GFX9-DL: ; %bb.0: ; %entry
1231 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1232 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1233 ; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
1234 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1235 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1236 ; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1237 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
1238 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
1240 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
1241 ; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
1242 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
1243 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1244 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1245 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1246 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
1247 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
1248 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1249 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1250 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1251 ; GFX9-DL-NEXT: s_endpgm
1252 <4 x i16> addrspace(1)* %src2,
1253 i32 addrspace(1)* nocapture %dst) {
1255 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
1256 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
; Product of element 1 of each vector, widened to i32 with zext.
1258 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1259 %conv = zext i16 %s1.elt1 to i32
1260 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1261 %conv2 = zext i16 %s2.elt1 to i32
1262 %mul1 = mul i32 %conv2, %conv
; Product of element 2 of each vector, widened to i32 with zext.
1264 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1265 %conv3 = zext i16 %s1.elt2 to i32
1266 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1267 %conv4 = zext i16 %s2.elt2 to i32
1268 %mul2 = mul i32 %conv4, %conv3
; Add both products to the value currently stored at %dst and write it back.
1270 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1271 %add = add i32 %mul2, %s3
1272 %add6 = add i32 %add, %mul1
1273 store i32 %add6, i32 addrspace(1)* %dst, align 4
; notudot2_DiffIndex cross-multiplies two <2 x i16> vectors: element 0 of
; %vec1 with element 1 of %vec2, and element 1 of %vec1 with element 0 of
; %vec2 (all zero-extended to i32), then accumulates both products into the
; i32 at %dst. None of the four check sets contains a v_dot2 instruction; all
; of them, including gfx906, expect two v_mad_u32_u24.
1277 define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
1278 ; GFX7-LABEL: notudot2_DiffIndex:
1279 ; GFX7: ; %bb.0: ; %entry
1280 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1281 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1282 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1283 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1284 ; GFX7-NEXT: s_mov_b32 s2, -1
1285 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1286 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1287 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1288 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1289 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1290 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1291 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1292 ; GFX7-NEXT: s_and_b32 s4, s4, s8
1293 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1294 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
1295 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1296 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
1297 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
1298 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
1299 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1300 ; GFX7-NEXT: s_endpgm
1302 ; GFX8-LABEL: notudot2_DiffIndex:
1303 ; GFX8: ; %bb.0: ; %entry
1304 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1305 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1306 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
1307 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1308 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1309 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1310 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1311 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1312 ; GFX8-NEXT: s_and_b32 s6, s3, s2
1313 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1314 ; GFX8-NEXT: s_and_b32 s2, s4, s2
1315 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1316 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1317 ; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1318 ; GFX8-NEXT: s_lshr_b32 s7, s4, 16
1319 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1320 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v1, v0
1321 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1322 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1323 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1324 ; GFX8-NEXT: s_endpgm
1326 ; GFX9-NODL-LABEL: notudot2_DiffIndex:
1327 ; GFX9-NODL: ; %bb.0: ; %entry
1328 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1329 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1330 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1331 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1332 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1333 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1334 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1335 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1336 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1337 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1338 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1339 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1340 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1341 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1342 ; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16
1343 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1344 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
1345 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1346 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1347 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1348 ; GFX9-NODL-NEXT: s_endpgm
1350 ; GFX9-DL-LABEL: notudot2_DiffIndex:
1351 ; GFX9-DL: ; %bb.0: ; %entry
1352 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1353 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1354 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1355 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1356 ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1357 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1358 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1359 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1360 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1361 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1362 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1363 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1364 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1365 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1366 ; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16
1367 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1368 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
1369 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1370 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1371 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1372 ; GFX9-DL-NEXT: s_endpgm
1373 <2 x i16> addrspace(1)* %src2,
1374 i32 addrspace(1)* nocapture %dst) {
1376 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1377 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 of %vec1 times element 1 of %vec2 (indices deliberately differ).
1379 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1380 %conv = zext i16 %s1.elt1 to i32
1381 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1382 %conv2 = zext i16 %s2.elt1 to i32
1383 %mul1 = mul i32 %conv2, %conv
; Element 1 of %vec1 times element 0 of %vec2.
1385 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1386 %conv3 = zext i16 %s1.elt2 to i32
1387 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1388 %conv4 = zext i16 %s2.elt2 to i32
1389 %mul2 = mul i32 %conv4, %conv3
; Add both products to the value currently stored at %dst and write it back.
1391 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1392 %add = add i32 %mul2, %s3
1393 %add6 = add i32 %add, %mul1
1394 store i32 %add6, i32 addrspace(1)* %dst, align 4
; udot2_MultipleUses_add1 builds the unsigned two-element multiply-accumulate
; on matching lanes of two <2 x i16> vectors, but the intermediate sum %add1
; (%mul2 + %s3) has a second use: it is added again to the final sum before
; the store. None of the four check sets contains a v_dot2 instruction; all of
; them, including gfx906, expect two v_mad_u32_u24 followed by a vector add.
1398 define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1399 ; GFX7-LABEL: udot2_MultipleUses_add1:
1400 ; GFX7: ; %bb.0: ; %entry
1401 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1402 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1403 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1404 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1405 ; GFX7-NEXT: s_mov_b32 s2, -1
1406 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1407 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1408 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1409 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1410 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1411 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1412 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1413 ; GFX7-NEXT: s_and_b32 s4, s4, s8
1414 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
1415 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1416 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1417 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1418 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
1419 ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0
1420 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1421 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1422 ; GFX7-NEXT: s_endpgm
1424 ; GFX8-LABEL: udot2_MultipleUses_add1:
1425 ; GFX8: ; %bb.0: ; %entry
1426 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1427 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1428 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
1429 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1430 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1431 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1432 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1433 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1434 ; GFX8-NEXT: s_and_b32 s6, s3, s2
1435 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1436 ; GFX8-NEXT: s_and_b32 s2, s4, s2
1437 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1438 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1439 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1440 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1441 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1442 ; GFX8-NEXT: v_mad_u32_u24 v1, s2, v1, v0
1443 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
1444 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1445 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1446 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1447 ; GFX8-NEXT: s_endpgm
1449 ; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1450 ; GFX9-NODL: ; %bb.0: ; %entry
1451 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1452 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1453 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1454 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1455 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1456 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1457 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1458 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1459 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1460 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1461 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1462 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1463 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1464 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1465 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1466 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1467 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
1468 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0
1469 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1470 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1471 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1472 ; GFX9-NODL-NEXT: s_endpgm
1474 ; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1475 ; GFX9-DL: ; %bb.0: ; %entry
1476 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1477 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1478 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1479 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1480 ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1481 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1482 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1483 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1484 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1485 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1486 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1487 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1488 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1489 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1490 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1491 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1492 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
1493 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0
1494 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1495 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1496 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1497 ; GFX9-DL-NEXT: s_endpgm
1498 <2 x i16> addrspace(1)* %src2,
1499 i32 addrspace(1)* nocapture %dst) {
1501 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1502 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Product of element 0 of each vector, widened to i32 with zext.
1504 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1505 %conv = zext i16 %s1.elt1 to i32
1506 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1507 %conv2 = zext i16 %s2.elt1 to i32
1508 %mul1 = mul i32 %conv2, %conv
; Product of element 1 of each vector, widened to i32 with zext.
1510 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1511 %conv3 = zext i16 %s1.elt2 to i32
1512 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1513 %conv4 = zext i16 %s2.elt2 to i32
1514 %mul2 = mul i32 %conv4, %conv3
; Accumulate into the value loaded from %dst; %add1 is the partial sum.
1516 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1517 %add1 = add i32 %mul2, %s3
1518 %add2 = add i32 %add1, %mul1
; Second use of %add1: the partial sum is added to the full sum.
1520 %res = add i32 %add2, %add1
1521 store i32 %res, i32 addrspace(1)* %dst, align 4
; idot2_MultipleUses_add1 is the signed counterpart of
; udot2_MultipleUses_add1: the <2 x i16> elements are widened with sext, and
; the partial sum %add1 (%mul2 + %s3) is used a second time in the final add.
; None of the four check sets contains a v_dot2 instruction; all of them,
; including gfx906, expect two v_mad_i32_i24 followed by a vector add.
1525 define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1526 ; GFX7-LABEL: idot2_MultipleUses_add1:
1527 ; GFX7: ; %bb.0: ; %entry
1528 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1529 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1530 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1531 ; GFX7-NEXT: s_mov_b32 s2, -1
1532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1533 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1534 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1535 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1536 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1537 ; GFX7-NEXT: s_sext_i32_i16 s7, s4
1538 ; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1539 ; GFX7-NEXT: s_sext_i32_i16 s8, s5
1540 ; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1541 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
1542 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1543 ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
1544 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1545 ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v1, v0
1546 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1547 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1548 ; GFX7-NEXT: s_endpgm
1550 ; GFX8-LABEL: idot2_MultipleUses_add1:
1551 ; GFX8: ; %bb.0: ; %entry
1552 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1553 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1554 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1555 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1556 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1557 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1558 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1559 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1560 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1561 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
1562 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16
1563 ; GFX8-NEXT: s_sext_i32_i16 s1, s3
1564 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16
1565 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
1566 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
1567 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1568 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
1569 ; GFX8-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1570 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1571 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1572 ; GFX8-NEXT: s_endpgm
1574 ; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1575 ; GFX9-NODL: ; %bb.0: ; %entry
1576 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1577 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1578 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1579 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1580 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1581 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1582 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1583 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1584 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1585 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
1586 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
1587 ; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
1588 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
1589 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
1590 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
1591 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1592 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
1593 ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1594 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2
1595 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1596 ; GFX9-NODL-NEXT: s_endpgm
1598 ; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1599 ; GFX9-DL: ; %bb.0: ; %entry
1600 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1601 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1602 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1603 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1604 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1605 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1606 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1607 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1608 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1609 ; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
1610 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
1611 ; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
1612 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
1613 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
1614 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
1615 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1616 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
1617 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1618 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1619 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1620 ; GFX9-DL-NEXT: s_endpgm
1621 <2 x i16> addrspace(1)* %src2,
1622 i32 addrspace(1)* nocapture %dst) {
1624 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1625 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Product of element 0 of each vector, widened to i32 with sext.
1627 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1628 %conv = sext i16 %s1.elt1 to i32
1629 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1630 %conv2 = sext i16 %s2.elt1 to i32
1631 %mul1 = mul i32 %conv2, %conv
; Product of element 1 of each vector, widened to i32 with sext.
1633 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1634 %conv3 = sext i16 %s1.elt2 to i32
1635 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1636 %conv4 = sext i16 %s2.elt2 to i32
1637 %mul2 = mul i32 %conv4, %conv3
; Accumulate into the value loaded from %dst; %add1 is the partial sum.
1639 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1640 %add1 = add i32 %mul2, %s3
1641 %add2 = add i32 %add1, %mul1
; Second use of %add1: the partial sum is added to the full sum.
1643 %res = add i32 %add2, %add1
1644 store i32 %res, i32 addrspace(1)* %dst, align 4
; udot2_MultipleUses_mul1: unsigned (zext) 2 x i16 multiply-accumulate chain in
; which %mul1 feeds two adds (%add0 and %add2). The expected output for every
; run line is three v_mad_u32_u24 and no v_dot2 instruction. Presumably the
; extra use of %mul1 is what keeps the dot2 combine from applying - confirm
; against the combine's implementation.
1648 define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1649 ; GFX7-LABEL: udot2_MultipleUses_mul1:
1650 ; GFX7: ; %bb.0: ; %entry
1651 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1652 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1653 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1654 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1655 ; GFX7-NEXT: s_mov_b32 s2, -1
1656 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1657 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1658 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1659 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1660 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1661 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1662 ; GFX7-NEXT: s_and_b32 s4, s4, s8
1663 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1664 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1665 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
1666 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1667 ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1
1668 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
1669 ; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1
1670 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
1671 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1672 ; GFX7-NEXT: s_endpgm
1674 ; GFX8-LABEL: udot2_MultipleUses_mul1:
1675 ; GFX8: ; %bb.0: ; %entry
1676 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1677 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1678 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
1679 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1680 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1681 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1682 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1683 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1684 ; GFX8-NEXT: s_and_b32 s6, s3, s2
1685 ; GFX8-NEXT: s_and_b32 s2, s4, s2
1686 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1687 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1688 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1689 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1690 ; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1691 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
1692 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1693 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1694 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1695 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1696 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1697 ; GFX8-NEXT: s_endpgm
1699 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
1700 ; GFX9-NODL: ; %bb.0: ; %entry
1701 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1702 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1703 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1704 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1705 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1706 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1707 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1708 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1709 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1710 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1711 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1712 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1713 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1714 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1715 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1716 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
1717 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1718 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1719 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1720 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1721 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1722 ; GFX9-NODL-NEXT: s_endpgm
1724 ; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
1725 ; GFX9-DL: ; %bb.0: ; %entry
1726 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1727 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1728 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1729 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1730 ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1731 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1732 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1733 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1734 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1735 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1736 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1737 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1738 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1739 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1740 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1741 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
1742 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1743 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1744 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1745 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1746 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1747 ; GFX9-DL-NEXT: s_endpgm
1748 <2 x i16> addrspace(1)* %src2,
1749 i32 addrspace(1)* nocapture %dst) {
1751 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1752 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 of both vectors, zero-extended to i32 and multiplied.
1754 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1755 %conv = zext i16 %s1.elt1 to i32
1756 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1757 %conv2 = zext i16 %s2.elt1 to i32
1758 %mul1 = mul i32 %conv2, %conv
; Element 1 of both vectors, zero-extended to i32 and multiplied.
1760 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1761 %conv3 = zext i16 %s1.elt2 to i32
1762 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1763 %conv4 = zext i16 %s2.elt2 to i32
1764 %mul2 = mul i32 %conv4, %conv3
; The accumulator is the i32 currently stored at %dst.
1766 %s3 = load i32, i32 addrspace(1)* %dst, align 4
; First use of %mul1.
1767 %add0 = add i32 %mul1, %s3
1769 %add1 = add i32 %mul2, %add0
; Second use of %mul1; the stored value is %s3 + 2 * %mul1 + %mul2.
1770 %add2 = add i32 %add1, %mul1
1772 store i32 %add2, i32 addrspace(1)* %dst, align 4
; idot2_MultipleUses_mul1: signed (sext) 2 x i16 multiply-accumulate chain in
; which %mul1 feeds two adds (%add0 and %add2). The expected output for every
; run line is three v_mad_i32_i24 and no v_dot2 instruction. Presumably the
; extra use of %mul1 is what keeps the dot2 combine from applying - confirm
; against the combine's implementation.
1776 define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1777 ; GFX7-LABEL: idot2_MultipleUses_mul1:
1778 ; GFX7: ; %bb.0: ; %entry
1779 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1780 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1781 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1782 ; GFX7-NEXT: s_mov_b32 s2, -1
1783 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1784 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1785 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1786 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1787 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1788 ; GFX7-NEXT: s_sext_i32_i16 s7, s4
1789 ; GFX7-NEXT: s_sext_i32_i16 s8, s5
1790 ; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1791 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
1792 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1793 ; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1794 ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
1795 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1796 ; GFX7-NEXT: v_mad_i32_i24 v1, s5, v2, v1
1797 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
1798 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1799 ; GFX7-NEXT: s_endpgm
1801 ; GFX8-LABEL: idot2_MultipleUses_mul1:
1802 ; GFX8: ; %bb.0: ; %entry
1803 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1804 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1805 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1806 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1807 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1808 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1809 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1810 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1811 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1812 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
1813 ; GFX8-NEXT: s_sext_i32_i16 s1, s3
1814 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16
1815 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
1816 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
1817 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16
1818 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1819 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
1820 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1821 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1822 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1823 ; GFX8-NEXT: s_endpgm
1825 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
1826 ; GFX9-NODL: ; %bb.0: ; %entry
1827 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1828 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1829 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1830 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1831 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1832 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1833 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1834 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1835 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1836 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
1837 ; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
1838 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
1839 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
1840 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
1841 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
1842 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1843 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2
1844 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1845 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1846 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1847 ; GFX9-NODL-NEXT: s_endpgm
1849 ; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
1850 ; GFX9-DL: ; %bb.0: ; %entry
1851 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1852 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1853 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1854 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1855 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1856 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1857 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1858 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1859 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
1861 ; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
1862 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
1863 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
1864 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
1865 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
1866 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1867 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2
1868 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1869 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1870 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1871 ; GFX9-DL-NEXT: s_endpgm
1872 <2 x i16> addrspace(1)* %src2,
1873 i32 addrspace(1)* nocapture %dst) {
1875 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1876 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 of both vectors, sign-extended to i32 and multiplied.
1878 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1879 %conv = sext i16 %s1.elt1 to i32
1880 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1881 %conv2 = sext i16 %s2.elt1 to i32
1882 %mul1 = mul i32 %conv2, %conv
; Element 1 of both vectors, sign-extended to i32 and multiplied.
1884 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1885 %conv3 = sext i16 %s1.elt2 to i32
1886 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1887 %conv4 = sext i16 %s2.elt2 to i32
1888 %mul2 = mul i32 %conv4, %conv3
; The accumulator is the i32 currently stored at %dst.
1890 %s3 = load i32, i32 addrspace(1)* %dst, align 4
; First use of %mul1.
1891 %add0 = add i32 %mul1, %s3
1893 %add1 = add i32 %mul2, %add0
; Second use of %mul1; the stored value is %s3 + 2 * %mul1 + %mul2.
1894 %add2 = add i32 %add1, %mul1
1896 store i32 %add2, i32 addrspace(1)* %dst, align 4
; udot2_MultipleUses_mul2: unsigned (zext) 2 x i16 multiply-accumulate chain in
; which %mul2 feeds two adds (%add0 and %add1). The expected output for every
; run line is three v_mad_u32_u24 and no v_dot2 instruction. Presumably the
; extra use of %mul2 is what keeps the dot2 combine from applying - confirm
; against the combine's implementation.
1900 define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
1901 ; GFX7-LABEL: udot2_MultipleUses_mul2:
1902 ; GFX7: ; %bb.0: ; %entry
1903 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1904 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1905 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
1906 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1907 ; GFX7-NEXT: s_mov_b32 s2, -1
1908 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1909 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1910 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1911 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1912 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1913 ; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1914 ; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1915 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
1916 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
1917 ; GFX7-NEXT: v_mad_u32_u24 v1, s9, v0, v1
1918 ; GFX7-NEXT: s_and_b32 s4, s4, s8
1919 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1920 ; GFX7-NEXT: s_and_b32 s5, s5, s8
1921 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
1922 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1923 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1924 ; GFX7-NEXT: s_endpgm
1926 ; GFX8-LABEL: udot2_MultipleUses_mul2:
1927 ; GFX8: ; %bb.0: ; %entry
1928 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1929 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1930 ; GFX8-NEXT: s_mov_b32 s2, 0xffff
1931 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1932 ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1933 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1934 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1935 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1936 ; GFX8-NEXT: s_and_b32 s6, s3, s2
1937 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1938 ; GFX8-NEXT: s_and_b32 s2, s4, s2
1939 ; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1940 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1941 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1942 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1943 ; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1944 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1945 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1946 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1947 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1948 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1949 ; GFX8-NEXT: s_endpgm
1951 ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
1952 ; GFX9-NODL: ; %bb.0: ; %entry
1953 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1954 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1955 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1956 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1957 ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1958 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1959 ; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1960 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1961 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1962 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1963 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1964 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1965 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1966 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1967 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1968 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1969 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1970 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1971 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1972 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1973 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1974 ; GFX9-NODL-NEXT: s_endpgm
1976 ; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
1977 ; GFX9-DL: ; %bb.0: ; %entry
1978 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1979 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1980 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1981 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1983 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1984 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1985 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1986 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1987 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1988 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1989 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1990 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1991 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1992 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1993 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1994 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1995 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1996 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1997 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1998 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1999 ; GFX9-DL-NEXT: s_endpgm
2000 <2 x i16> addrspace(1)* %src2,
2001 i32 addrspace(1)* nocapture %dst) {
2003 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2004 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 of both vectors, zero-extended to i32 and multiplied.
2006 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2007 %conv = zext i16 %s1.elt1 to i32
2008 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2009 %conv2 = zext i16 %s2.elt1 to i32
2010 %mul1 = mul i32 %conv2, %conv
; Element 1 of both vectors, zero-extended to i32 and multiplied.
2012 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2013 %conv3 = zext i16 %s1.elt2 to i32
2014 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2015 %conv4 = zext i16 %s2.elt2 to i32
2016 %mul2 = mul i32 %conv4, %conv3
; The accumulator is the i32 currently stored at %dst.
2018 %s3 = load i32, i32 addrspace(1)* %dst, align 4
; First use of %mul2.
2019 %add0 = add i32 %mul2, %s3
; Second use of %mul2.
2021 %add1 = add i32 %mul2, %add0
; The stored value is %s3 + 2 * %mul2 + %mul1.
2022 %add2 = add i32 %add1, %mul1
2024 store i32 %add2, i32 addrspace(1)* %dst, align 4
; idot2_MultipleUses_mul2: signed (sext) 2 x i16 multiply-accumulate chain in
; which %mul2 feeds two adds (%add0 and %add1). The expected output for every
; run line is three v_mad_i32_i24 and no v_dot2 instruction. Presumably the
; extra use of %mul2 is what keeps the dot2 combine from applying - confirm
; against the combine's implementation.
2028 define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2029 ; GFX7-LABEL: idot2_MultipleUses_mul2:
2030 ; GFX7: ; %bb.0: ; %entry
2031 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2032 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2033 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2034 ; GFX7-NEXT: s_mov_b32 s2, -1
2035 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2036 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2037 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2038 ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
2039 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2040 ; GFX7-NEXT: s_sext_i32_i16 s7, s4
2041 ; GFX7-NEXT: s_ashr_i32 s4, s4, 16
2042 ; GFX7-NEXT: s_sext_i32_i16 s8, s5
2043 ; GFX7-NEXT: s_ashr_i32 s5, s5, 16
2044 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2045 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
2046 ; GFX7-NEXT: v_mad_i32_i24 v1, s5, v0, v1
2047 ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
2048 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
2049 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
2050 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2051 ; GFX7-NEXT: s_endpgm
2053 ; GFX8-LABEL: idot2_MultipleUses_mul2:
2054 ; GFX8: ; %bb.0: ; %entry
2055 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2056 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2057 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2058 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
2059 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
2060 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
2061 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2062 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2063 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2064 ; GFX8-NEXT: s_sext_i32_i16 s0, s2
2065 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16
2066 ; GFX8-NEXT: s_sext_i32_i16 s1, s3
2067 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16
2068 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2069 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
2070 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2071 ; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2072 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
2073 ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2074 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2075 ; GFX8-NEXT: s_endpgm
2077 ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2078 ; GFX9-NODL: ; %bb.0: ; %entry
2079 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2080 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2081 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2082 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2083 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2084 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
2085 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2086 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2087 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2088 ; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
2089 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
2090 ; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
2091 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
2092 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
2093 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
2094 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2095 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2096 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
2097 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2098 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2099 ; GFX9-NODL-NEXT: s_endpgm
2101 ; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2102 ; GFX9-DL: ; %bb.0: ; %entry
2103 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2104 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2105 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2106 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2107 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2108 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2109 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2110 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2111 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2112 ; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
2113 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
2114 ; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
2115 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
2116 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
2117 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
2118 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2119 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2120 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
2121 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2122 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2123 ; GFX9-DL-NEXT: s_endpgm
2124 <2 x i16> addrspace(1)* %src2,
2125 i32 addrspace(1)* nocapture %dst) {
2127 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2128 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 of both vectors, sign-extended to i32 and multiplied.
2130 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2131 %conv = sext i16 %s1.elt1 to i32
2132 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2133 %conv2 = sext i16 %s2.elt1 to i32
2134 %mul1 = mul i32 %conv2, %conv
; Element 1 of both vectors, sign-extended to i32 and multiplied.
2136 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2137 %conv3 = sext i16 %s1.elt2 to i32
2138 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2139 %conv4 = sext i16 %s2.elt2 to i32
2140 %mul2 = mul i32 %conv4, %conv3
; The accumulator is the i32 currently stored at %dst.
2142 %s3 = load i32, i32 addrspace(1)* %dst, align 4
; First use of %mul2.
2143 %add0 = add i32 %mul2, %s3
; Second use of %mul2.
2145 %add1 = add i32 %mul2, %add0
; The stored value is %s3 + 2 * %mul2 + %mul1.
2146 %add2 = add i32 %add1, %mul1
2148 store i32 %add2, i32 addrspace(1)* %dst, align 4
; udot2_acc16: 2 x i16 multiply-accumulate done entirely in i16. The multiplies,
; the adds and the accumulator loaded from %dst are all i16, with no explicit
; extension in the IR. The gfx906 run expects a single v_dot2_u32_u16 whose
; result is written back with a 16-bit store; the other three runs expect two
; v_mad_u32_u24 followed by a 16-bit store.
2152 define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
2153 ; GFX7-LABEL: udot2_acc16:
2154 ; GFX7: ; %bb.0: ; %entry
2155 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2156 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2157 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2158 ; GFX7-NEXT: s_mov_b32 s2, -1
2159 ; GFX7-NEXT: s_mov_b32 s8, 0xffff
2160 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2161 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2162 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
2163 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2164 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2165 ; GFX7-NEXT: s_lshr_b32 s6, s4, 16
2166 ; GFX7-NEXT: s_and_b32 s4, s4, s8
2167 ; GFX7-NEXT: s_lshr_b32 s7, s5, 16
2168 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
2169 ; GFX7-NEXT: s_and_b32 s5, s5, s8
2170 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
2172 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2173 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
2174 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2175 ; GFX7-NEXT: s_endpgm
2177 ; GFX8-LABEL: udot2_acc16:
2178 ; GFX8: ; %bb.0: ; %entry
2179 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2180 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2181 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2183 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2184 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
2185 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2186 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2187 ; GFX8-NEXT: s_mov_b32 s0, 0xffff
2188 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2189 ; GFX8-NEXT: s_and_b32 s3, s1, s0
2190 ; GFX8-NEXT: s_and_b32 s0, s2, s0
2191 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
2192 ; GFX8-NEXT: s_lshr_b32 s1, s1, 16
2193 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
2194 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2195 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
2196 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
2197 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
2198 ; GFX8-NEXT: flat_store_short v[0:1], v2
2199 ; GFX8-NEXT: s_endpgm
2201 ; GFX9-NODL-LABEL: udot2_acc16:
2202 ; GFX9-NODL: ; %bb.0: ; %entry
2203 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2204 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2205 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2206 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2207 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2208 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
2209 ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
2210 ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
2211 ; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
2212 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2213 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
2214 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
2215 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
2216 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
2217 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
2218 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2219 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
2220 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
2221 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
2222 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
2223 ; GFX9-NODL-NEXT: s_endpgm
2225 ; GFX9-DL-LABEL: udot2_acc16:
2226 ; GFX9-DL: ; %bb.0: ; %entry
2227 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2228 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2229 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2230 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2231 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2232 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2233 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2234 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
2235 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2236 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
2237 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2238 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
2239 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
2240 ; GFX9-DL-NEXT: s_endpgm
2241 <2 x i16> addrspace(1)* %src2,
2242 i16 addrspace(1)* nocapture %dst) {
2244 %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2245 %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
; Element 0 product, computed directly in i16.
2247 %v1e1 = extractelement <2 x i16> %v1, i64 0
2248 %v2e1 = extractelement <2 x i16> %v2, i64 0
2249 %mul1 = mul i16 %v1e1, %v2e1
; Element 1 product, computed directly in i16.
2251 %v1e2 = extractelement <2 x i16> %v1, i64 1
2252 %v2e2 = extractelement <2 x i16> %v2, i64 1
2253 %mul2 = mul i16 %v1e2, %v2e2
; 16-bit accumulator read from %dst; each product is used exactly once.
2255 %s2 = load i16, i16 addrspace(1)* %dst, align 2
2256 %add1 = add i16 %mul2, %s2
2257 %add2 = add i16 %add1, %mul1
2258 store i16 %add2, i16 addrspace(1)* %dst, align 2
; notsdot2_sext8: multiply-accumulate over <2 x i8> inputs whose elements are
; sign-extended from i8 to i32 before the (nuw) multiplies. Every run line,
; including gfx906, expects v_bfe_i32 sign extensions feeding two v_mad_i32_i24
; and no v_dot2 instruction. The "not" in the name suggests this is a negative
; test for the signed dot2 pattern - presumably because the sources are i8
; rather than i16; confirm against the combine's implementation.
2262 define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
2263 ; GFX7-LABEL: notsdot2_sext8:
2264 ; GFX7: ; %bb.0: ; %entry
2265 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2266 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2267 ; GFX7-NEXT: s_mov_b32 s2, -1
2268 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2269 ; GFX7-NEXT: s_mov_b32 s10, s2
2270 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2271 ; GFX7-NEXT: s_mov_b32 s8, s6
2272 ; GFX7-NEXT: s_mov_b32 s9, s7
2273 ; GFX7-NEXT: s_mov_b32 s11, s3
2274 ; GFX7-NEXT: s_mov_b32 s6, s2
2275 ; GFX7-NEXT: s_mov_b32 s7, s3
2276 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
2277 ; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0
2278 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2279 ; GFX7-NEXT: s_waitcnt vmcnt(1)
2280 ; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8
2281 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2282 ; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8
2283 ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
2284 ; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8
2285 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2286 ; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4
2287 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0
2288 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2289 ; GFX7-NEXT: s_endpgm
2291 ; GFX8-LABEL: notsdot2_sext8:
2292 ; GFX8: ; %bb.0: ; %entry
2293 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2294 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2295 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2296 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
2297 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
2298 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2299 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2300 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
2301 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
2302 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
2303 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2304 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2305 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
2306 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2307 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2308 ; GFX8-NEXT: v_bfe_i32 v5, v3, 0, 8
2309 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2310 ; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
2311 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
2312 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
2313 ; GFX8-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2314 ; GFX8-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2315 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2316 ; GFX8-NEXT: s_endpgm
2318 ; GFX9-NODL-LABEL: notsdot2_sext8:
2319 ; GFX9-NODL: ; %bb.0: ; %entry
2320 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2321 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2322 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2323 ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
2324 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
2325 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
2326 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
2327 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
2328 ; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
2329 ; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off
2330 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2331 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2332 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2333 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2334 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2335 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v3, 0, 8
2336 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2337 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8
2338 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
2339 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
2340 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2341 ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2342 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2343 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2344 ; GFX9-NODL-NEXT: s_endpgm
2346 ; GFX9-DL-LABEL: notsdot2_sext8:
2347 ; GFX9-DL: ; %bb.0: ; %entry
2348 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2349 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2350 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2351 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
2352 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
2353 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
2354 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
2355 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
2356 ; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
2357 ; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off
2358 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2359 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2360 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2361 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2362 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2363 ; GFX9-DL-NEXT: v_bfe_i32 v5, v3, 0, 8
2364 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2365 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
2366 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
2367 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
2368 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2369 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2370 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2371 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2372 ; GFX9-DL-NEXT: s_endpgm
2373 <2 x i8> addrspace(1)* %src2,
2374 i32 addrspace(1)* nocapture %dst) {
2376 %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1
2377 %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2
; Element 0 of both vectors, sign-extended from i8 to i32 and multiplied.
2379 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2380 %conv = sext i8 %s1.elt1 to i32
2381 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2382 %conv2 = sext i8 %s2.elt1 to i32
2383 %mul1 = mul nuw i32 %conv2, %conv
; Element 1 of both vectors, sign-extended from i8 to i32 and multiplied.
2385 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2386 %conv3 = sext i8 %s1.elt2 to i32
2387 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2388 %conv4 = sext i8 %s2.elt2 to i32
2389 %mul2 = mul nuw i32 %conv4, %conv3
; 32-bit accumulator read from %dst; each product is used exactly once.
2391 %s3 = load i32, i32 addrspace(1)* %dst, align 4
2392 %add = add i32 %mul2, %s3
2393 %add6 = add i32 %add, %mul1
2394 store i32 %add6, i32 addrspace(1)* %dst, align 4