1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOSDWA %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX89 %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX10 %s
; add_shr_i32 loads one i32 from %in, adds the value to itself shifted right
; by 16 bits, and stores the sum to %out.
; With -amdgpu-sdwa-peephole=0 the checks expect a separate v_lshrrev_b32
; feeding a plain v_add_u32; with the peephole enabled they expect one SDWA
; add whose shifted operand is read through a WORD_1 source select.
; NOTE(review): the fiji checks place the shifted operand first (src0) while
; the gfx900 and gfx1010 checks place it second (src1). The add is
; commutative, so both orders give the same value; rerun
; update_llc_test_checks.py to confirm the order llc really emits.
7 define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
8 ; NOSDWA-LABEL: add_shr_i32:
10 ; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
11 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
12 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
13 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
14 ; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
15 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
16 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
17 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
18 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
19 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2
20 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
21 ; NOSDWA-NEXT: s_endpgm
23 ; GFX89-LABEL: add_shr_i32:
25 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
26 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
27 ; GFX89-NEXT: v_mov_b32_e32 v0, s2
28 ; GFX89-NEXT: v_mov_b32_e32 v1, s3
29 ; GFX89-NEXT: flat_load_dword v2, v[0:1]
30 ; GFX89-NEXT: v_mov_b32_e32 v0, s0
31 ; GFX89-NEXT: v_mov_b32_e32 v1, s1
32 ; GFX89-NEXT: s_waitcnt vmcnt(0)
33 ; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
34 ; GFX89-NEXT: flat_store_dword v[0:1], v2
35 ; GFX89-NEXT: s_endpgm
37 ; GFX9-LABEL: add_shr_i32:
39 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
40 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
43 ; GFX9-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
45 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
48 ; GFX10-LABEL: add_shr_i32:
50 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
51 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
52 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
54 ; GFX10-NEXT: s_waitcnt vmcnt(0)
55 ; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
56 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
57 ; GFX10-NEXT: s_endpgm
58 %a = load i32, ptr addrspace(1) %in, align 4
59 %shr = lshr i32 %a, 16
60 %add = add i32 %a, %shr
61 store i32 %add, ptr addrspace(1) %out, align 4
; sub_shr_i32 loads one i32 from %in, computes (value >> 16) - value, and
; stores the difference to %out.
; Operand order matters here because the subtraction is not commutative: the
; shifted value is the minuend. With -amdgpu-sdwa-peephole=0 the checks expect
; v_lshrrev_b32 followed by v_sub_u32 with the shifted register first; with
; the peephole enabled they expect one SDWA subtract that takes WORD_1 on src0
; and the full DWORD on src1.
65 define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
66 ; NOSDWA-LABEL: sub_shr_i32:
68 ; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
69 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
70 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
71 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
72 ; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
73 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
74 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
75 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
76 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
77 ; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2
78 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
79 ; NOSDWA-NEXT: s_endpgm
81 ; GFX89-LABEL: sub_shr_i32:
83 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
84 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX89-NEXT: v_mov_b32_e32 v0, s2
86 ; GFX89-NEXT: v_mov_b32_e32 v1, s3
87 ; GFX89-NEXT: flat_load_dword v2, v[0:1]
88 ; GFX89-NEXT: v_mov_b32_e32 v0, s0
89 ; GFX89-NEXT: v_mov_b32_e32 v1, s1
90 ; GFX89-NEXT: s_waitcnt vmcnt(0)
91 ; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
92 ; GFX89-NEXT: flat_store_dword v[0:1], v2
93 ; GFX89-NEXT: s_endpgm
95 ; GFX9-LABEL: sub_shr_i32:
97 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
98 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
99 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
101 ; GFX9-NEXT: s_waitcnt vmcnt(0)
102 ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
103 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
104 ; GFX9-NEXT: s_endpgm
106 ; GFX10-LABEL: sub_shr_i32:
108 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
109 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
112 ; GFX10-NEXT: s_waitcnt vmcnt(0)
113 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
114 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
115 ; GFX10-NEXT: s_endpgm
116 %a = load i32, ptr addrspace(1) %in, align 4
117 %shr = lshr i32 %a, 16
118 %sub = sub i32 %shr, %a
119 store i32 %sub, ptr addrspace(1) %out, align 4
; mul_shr_i32 loads one i32 from %in1 and one from %in2, both indexed by the
; workitem id in x, shifts each right by 16 bits, multiplies the two shifted
; values, and stores the product to %out.
; With -amdgpu-sdwa-peephole=0 the checks expect two v_lshrrev_b32
; instructions feeding v_mul_u32_u24; with the peephole enabled they expect a
; single v_mul_u32_u24_sdwa that selects WORD_1 on both sources.
123 define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
124 ; NOSDWA-LABEL: mul_shr_i32:
126 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
127 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
128 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
129 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
130 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
131 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
132 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
133 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
134 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
135 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
136 ; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
137 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
138 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
139 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
140 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
141 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4
142 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
143 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
144 ; NOSDWA-NEXT: v_mul_u32_u24_e32 v2, v3, v2
145 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
146 ; NOSDWA-NEXT: s_endpgm
148 ; GFX89-LABEL: mul_shr_i32:
150 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
151 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
152 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
153 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
155 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
156 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
157 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
158 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
159 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
160 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
161 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
162 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
163 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
164 ; GFX89-NEXT: s_waitcnt vmcnt(0)
165 ; GFX89-NEXT: v_mul_u32_u24_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
166 ; GFX89-NEXT: flat_store_dword v[0:1], v2
167 ; GFX89-NEXT: s_endpgm
169 ; GFX9-LABEL: mul_shr_i32:
171 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
172 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
173 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
174 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
175 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
176 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
177 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
178 ; GFX9-NEXT: s_waitcnt vmcnt(0)
179 ; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
180 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
181 ; GFX9-NEXT: s_endpgm
183 ; GFX10-LABEL: mul_shr_i32:
185 ; GFX10-NEXT: s_clause 0x1
186 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
187 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
188 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
189 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX10-NEXT: s_clause 0x1
191 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
192 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
193 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
194 ; GFX10-NEXT: s_waitcnt vmcnt(0)
195 ; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
196 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
197 ; GFX10-NEXT: s_endpgm
198 %idx = call i32 @llvm.amdgcn.workitem.id.x()
199 %gep1 = getelementptr i32, ptr addrspace(1) %in1, i32 %idx
200 %gep2 = getelementptr i32, ptr addrspace(1) %in2, i32 %idx
201 %a = load i32, ptr addrspace(1) %gep1, align 4
202 %b = load i32, ptr addrspace(1) %gep2, align 4
203 %shra = lshr i32 %a, 16
204 %shrb = lshr i32 %b, 16
205 %mul = mul i32 %shra, %shrb
206 store i32 %mul, ptr addrspace(1) %out, align 4
; mul_i16 loads one i16 from %ina and one from %inb, both indexed by the
; workitem id in x, multiplies them, and stores the i16 product to %out.
; Scalar 16-bit case: all four runs expect a plain v_mul_lo_u16 (the VOP3
; spelling without the _e32 suffix on gfx1010) and no SDWA instruction, with
; or without the peephole.
210 define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
211 ; NOSDWA-LABEL: mul_i16:
212 ; NOSDWA: ; %bb.0: ; %entry
213 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
214 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
215 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
216 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
217 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
218 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
219 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
220 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
221 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
222 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
223 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
224 ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
225 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
226 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
227 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
228 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2
229 ; NOSDWA-NEXT: flat_store_short v[0:1], v2
230 ; NOSDWA-NEXT: s_endpgm
232 ; GFX89-LABEL: mul_i16:
233 ; GFX89: ; %bb.0: ; %entry
234 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
235 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
236 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
237 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
238 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
239 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
240 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
241 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
242 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
243 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
244 ; GFX89-NEXT: flat_load_ushort v4, v[0:1]
245 ; GFX89-NEXT: flat_load_ushort v2, v[2:3]
246 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
247 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
248 ; GFX89-NEXT: s_waitcnt vmcnt(0)
249 ; GFX89-NEXT: v_mul_lo_u16_e32 v2, v4, v2
250 ; GFX89-NEXT: flat_store_short v[0:1], v2
251 ; GFX89-NEXT: s_endpgm
253 ; GFX9-LABEL: mul_i16:
254 ; GFX9: ; %bb.0: ; %entry
255 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
256 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
257 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
260 ; GFX9-NEXT: global_load_ushort v2, v0, s[0:1]
261 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
262 ; GFX9-NEXT: s_waitcnt vmcnt(0)
263 ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2
264 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
265 ; GFX9-NEXT: s_endpgm
267 ; GFX10-LABEL: mul_i16:
268 ; GFX10: ; %bb.0: ; %entry
269 ; GFX10-NEXT: s_clause 0x1
270 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
271 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
272 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
273 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
274 ; GFX10-NEXT: s_clause 0x1
275 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
276 ; GFX10-NEXT: global_load_ushort v2, v0, s[0:1]
277 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX10-NEXT: s_waitcnt vmcnt(0)
279 ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2
280 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
281 ; GFX10-NEXT: s_endpgm
283 %idx = call i32 @llvm.amdgcn.workitem.id.x()
284 %gepa = getelementptr i16, ptr addrspace(1) %ina, i32 %idx
285 %gepb = getelementptr i16, ptr addrspace(1) %inb, i32 %idx
286 %a = load i16, ptr addrspace(1) %gepa, align 4
287 %b = load i16, ptr addrspace(1) %gepb, align 4
288 %mul = mul i16 %a, %b
289 store i16 %mul, ptr addrspace(1) %out, align 4
; mul_v2i16 loads one <2 x i16> from %ina and one from %inb, both indexed by
; the workitem id in x, multiplies them element-wise, and stores the result
; to %out.
; Expected lowering per run:
;  - fiji with -amdgpu-sdwa-peephole=0 expects the high halves to be extracted
;    with v_lshrrev_b32, multiplied with v_mul_lo_u16, shifted back with
;    v_lshlrev_b32 and merged with v_or_b32.
;  - fiji with the peephole expects the high-half multiply as one
;    v_mul_lo_u16_sdwa that reads WORD_1 from both sources and writes WORD_1,
;    followed by v_or_b32.
;  - gfx900 and gfx1010 expect a single packed v_pk_mul_lo_u16.
293 define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
294 ; NOSDWA-LABEL: mul_v2i16:
295 ; NOSDWA: ; %bb.0: ; %entry
296 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
297 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
298 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
299 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
300 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
301 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
302 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
303 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
304 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
305 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
306 ; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
307 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
308 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
309 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
310 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
311 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v4, v2
312 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
313 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4
314 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2
315 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
316 ; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2
317 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
318 ; NOSDWA-NEXT: s_endpgm
320 ; GFX89-LABEL: mul_v2i16:
321 ; GFX89: ; %bb.0: ; %entry
322 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
323 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
324 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
325 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
326 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
327 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
328 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
329 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
330 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
331 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
332 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
333 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
334 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
335 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
336 ; GFX89-NEXT: s_waitcnt vmcnt(0)
337 ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2
338 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
339 ; GFX89-NEXT: v_or_b32_e32 v2, v3, v2
340 ; GFX89-NEXT: flat_store_dword v[0:1], v2
341 ; GFX89-NEXT: s_endpgm
343 ; GFX9-LABEL: mul_v2i16:
344 ; GFX9: ; %bb.0: ; %entry
345 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
346 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
347 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
348 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
349 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
350 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
351 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
352 ; GFX9-NEXT: s_waitcnt vmcnt(0)
353 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
354 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
355 ; GFX9-NEXT: s_endpgm
357 ; GFX10-LABEL: mul_v2i16:
358 ; GFX10: ; %bb.0: ; %entry
359 ; GFX10-NEXT: s_clause 0x1
360 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
361 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
362 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
363 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX10-NEXT: s_clause 0x1
365 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
366 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
367 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
368 ; GFX10-NEXT: s_waitcnt vmcnt(0)
369 ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2
370 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
371 ; GFX10-NEXT: s_endpgm
373 %idx = call i32 @llvm.amdgcn.workitem.id.x()
374 %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx
375 %gepb = getelementptr <2 x i16>, ptr addrspace(1) %inb, i32 %idx
376 %a = load <2 x i16>, ptr addrspace(1) %gepa, align 4
377 %b = load <2 x i16>, ptr addrspace(1) %gepb, align 4
378 %mul = mul <2 x i16> %a, %b
379 store <2 x i16> %mul, ptr addrspace(1) %out, align 4
; mul_v4i16 loads one <4 x i16> from %ina and one from %inb, both indexed by
; the workitem id in x, multiplies them element-wise, and stores the result
; to %out.
; The vector occupies two dwords, so each expected pattern appears twice:
;  - fiji with -amdgpu-sdwa-peephole=0 expects shift / v_mul_lo_u16 / shift /
;    v_or_b32 for the high half of each dword.
;  - fiji with the peephole expects one v_mul_lo_u16_sdwa (WORD_1 sources and
;    WORD_1 destination) plus v_or_b32 per dword.
;  - gfx900 and gfx1010 expect one v_pk_mul_lo_u16 per dword.
383 define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
384 ; NOSDWA-LABEL: mul_v4i16:
385 ; NOSDWA: ; %bb.0: ; %entry
386 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
387 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
388 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
389 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
390 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
391 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
392 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
393 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
394 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
395 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
396 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
397 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
398 ; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
399 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s5
400 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
401 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v1, v3
402 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3
403 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1
404 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v0, v2
405 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
406 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0
407 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3
408 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2
409 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
410 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
411 ; NOSDWA-NEXT: v_or_b32_e32 v1, v6, v1
412 ; NOSDWA-NEXT: v_or_b32_e32 v0, v7, v0
413 ; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
414 ; NOSDWA-NEXT: s_endpgm
416 ; GFX89-LABEL: mul_v4i16:
417 ; GFX89: ; %bb.0: ; %entry
418 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
419 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
420 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
421 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
422 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
423 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
424 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
425 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
426 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
427 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
428 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
429 ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
430 ; GFX89-NEXT: v_mov_b32_e32 v4, s4
431 ; GFX89-NEXT: v_mov_b32_e32 v5, s5
432 ; GFX89-NEXT: s_waitcnt vmcnt(0)
433 ; GFX89-NEXT: v_mul_lo_u16_e32 v6, v1, v3
434 ; GFX89-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
435 ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v0, v2
436 ; GFX89-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
437 ; GFX89-NEXT: v_or_b32_e32 v1, v6, v1
438 ; GFX89-NEXT: v_or_b32_e32 v0, v3, v0
439 ; GFX89-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
440 ; GFX89-NEXT: s_endpgm
442 ; GFX9-LABEL: mul_v4i16:
443 ; GFX9: ; %bb.0: ; %entry
444 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
445 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
446 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
447 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
448 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
449 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
450 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
451 ; GFX9-NEXT: s_waitcnt vmcnt(0)
452 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
453 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
454 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
455 ; GFX9-NEXT: s_endpgm
457 ; GFX10-LABEL: mul_v4i16:
458 ; GFX10: ; %bb.0: ; %entry
459 ; GFX10-NEXT: s_clause 0x1
460 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
461 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
462 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
463 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX10-NEXT: s_clause 0x1
465 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
466 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
467 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
468 ; GFX10-NEXT: s_waitcnt vmcnt(0)
469 ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3
470 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2
471 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
472 ; GFX10-NEXT: s_endpgm
474 %idx = call i32 @llvm.amdgcn.workitem.id.x()
475 %gepa = getelementptr <4 x i16>, ptr addrspace(1) %ina, i32 %idx
476 %gepb = getelementptr <4 x i16>, ptr addrspace(1) %inb, i32 %idx
477 %a = load <4 x i16>, ptr addrspace(1) %gepa, align 4
478 %b = load <4 x i16>, ptr addrspace(1) %gepb, align 4
479 %mul = mul <4 x i16> %a, %b
480 store <4 x i16> %mul, ptr addrspace(1) %out, align 4
; mul_v8i16 loads one <8 x i16> from %ina and one from %inb, both indexed by
; the workitem id in x, multiplies them element-wise, and stores the result
; to %out.
; The vector occupies four dwords, so each expected pattern appears four
; times:
;  - fiji with -amdgpu-sdwa-peephole=0 expects shift / v_mul_lo_u16 / shift /
;    v_or_b32 for the high half of each dword.
;  - fiji with the peephole expects one v_mul_lo_u16_sdwa (WORD_1 sources and
;    WORD_1 destination) plus v_or_b32 per dword.
;  - gfx900 and gfx1010 expect one v_pk_mul_lo_u16 per dword.
484 define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
485 ; NOSDWA-LABEL: mul_v8i16:
486 ; NOSDWA: ; %bb.0: ; %entry
487 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
488 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
489 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0
490 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
491 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
492 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
493 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
494 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
495 ; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2
496 ; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
497 ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
498 ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
499 ; NOSDWA-NEXT: v_mov_b32_e32 v8, s4
500 ; NOSDWA-NEXT: v_mov_b32_e32 v9, s5
501 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
502 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7
503 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7
504 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3
505 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v11, v2, v6
506 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v6
507 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
508 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v12, v1, v5
509 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5
510 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1
511 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v13, v0, v4
512 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4
513 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0
514 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7
515 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v6
516 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v5
517 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v4
518 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
519 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
520 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
521 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
522 ; NOSDWA-NEXT: v_or_b32_e32 v3, v10, v3
523 ; NOSDWA-NEXT: v_or_b32_e32 v2, v11, v2
524 ; NOSDWA-NEXT: v_or_b32_e32 v1, v12, v1
525 ; NOSDWA-NEXT: v_or_b32_e32 v0, v13, v0
526 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
527 ; NOSDWA-NEXT: s_endpgm
529 ; GFX89-LABEL: mul_v8i16:
530 ; GFX89: ; %bb.0: ; %entry
531 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
532 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
533 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0
534 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
536 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
537 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
538 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
539 ; GFX89-NEXT: v_add_u32_e32 v4, vcc, s0, v2
540 ; GFX89-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
541 ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
542 ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
543 ; GFX89-NEXT: v_mov_b32_e32 v8, s4
544 ; GFX89-NEXT: v_mov_b32_e32 v9, s5
545 ; GFX89-NEXT: s_waitcnt vmcnt(0)
546 ; GFX89-NEXT: v_mul_lo_u16_e32 v10, v3, v7
547 ; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
548 ; GFX89-NEXT: v_mul_lo_u16_e32 v7, v2, v6
549 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
550 ; GFX89-NEXT: v_mul_lo_u16_e32 v6, v1, v5
551 ; GFX89-NEXT: v_mul_lo_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
552 ; GFX89-NEXT: v_mul_lo_u16_e32 v5, v0, v4
553 ; GFX89-NEXT: v_mul_lo_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
554 ; GFX89-NEXT: v_or_b32_e32 v3, v10, v3
555 ; GFX89-NEXT: v_or_b32_e32 v2, v7, v2
556 ; GFX89-NEXT: v_or_b32_e32 v1, v6, v1
557 ; GFX89-NEXT: v_or_b32_e32 v0, v5, v0
558 ; GFX89-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
559 ; GFX89-NEXT: s_endpgm
561 ; GFX9-LABEL: mul_v8i16:
562 ; GFX9: ; %bb.0: ; %entry
563 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
564 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
565 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0
566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
568 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1]
569 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
570 ; GFX9-NEXT: s_waitcnt vmcnt(0)
571 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7
572 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6
573 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5
574 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v4
575 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
576 ; GFX9-NEXT: s_endpgm
578 ; GFX10-LABEL: mul_v8i16:
579 ; GFX10: ; %bb.0: ; %entry
580 ; GFX10-NEXT: s_clause 0x1
581 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
582 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
583 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0
584 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
585 ; GFX10-NEXT: s_clause 0x1
586 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
587 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1]
588 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
589 ; GFX10-NEXT: s_waitcnt vmcnt(0)
590 ; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7
591 ; GFX10-NEXT: v_pk_mul_lo_u16 v2, v2, v6
592 ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v5
593 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v4
594 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
595 ; GFX10-NEXT: s_endpgm
597 %idx = call i32 @llvm.amdgcn.workitem.id.x()
598 %gepa = getelementptr <8 x i16>, ptr addrspace(1) %ina, i32 %idx
599 %gepb = getelementptr <8 x i16>, ptr addrspace(1) %inb, i32 %idx
600 %a = load <8 x i16>, ptr addrspace(1) %gepa, align 4
601 %b = load <8 x i16>, ptr addrspace(1) %gepb, align 4
602 %mul = mul <8 x i16> %a, %b
603 store <8 x i16> %mul, ptr addrspace(1) %out, align 4
; mul_half loads one half from %ina and one from %inb (no per-workitem
; indexing), multiplies them with fmul, and stores the half result to %out.
; Scalar f16 case: all four runs expect a plain v_mul_f16_e32 and no SDWA
; instruction, with or without the peephole.
607 define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
608 ; NOSDWA-LABEL: mul_half:
609 ; NOSDWA: ; %bb.0: ; %entry
610 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
611 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
612 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
613 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
614 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
615 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
616 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
617 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
618 ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
619 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
620 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
621 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
622 ; NOSDWA-NEXT: v_mul_f16_e32 v2, v4, v2
623 ; NOSDWA-NEXT: flat_store_short v[0:1], v2
624 ; NOSDWA-NEXT: s_endpgm
626 ; GFX89-LABEL: mul_half:
627 ; GFX89: ; %bb.0: ; %entry
628 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
629 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
630 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
631 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
632 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
633 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
634 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
635 ; GFX89-NEXT: flat_load_ushort v4, v[0:1]
636 ; GFX89-NEXT: flat_load_ushort v2, v[2:3]
637 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
638 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
639 ; GFX89-NEXT: s_waitcnt vmcnt(0)
640 ; GFX89-NEXT: v_mul_f16_e32 v2, v4, v2
641 ; GFX89-NEXT: flat_store_short v[0:1], v2
642 ; GFX89-NEXT: s_endpgm
644 ; GFX9-LABEL: mul_half:
645 ; GFX9: ; %bb.0: ; %entry
646 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
647 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
648 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
651 ; GFX9-NEXT: global_load_ushort v2, v0, s[0:1]
652 ; GFX9-NEXT: s_waitcnt vmcnt(0)
653 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2
654 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
655 ; GFX9-NEXT: s_endpgm
657 ; GFX10-LABEL: mul_half:
658 ; GFX10: ; %bb.0: ; %entry
659 ; GFX10-NEXT: s_clause 0x1
660 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
661 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
662 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
663 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
664 ; GFX10-NEXT: s_clause 0x1
665 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
666 ; GFX10-NEXT: global_load_ushort v2, v0, s[0:1]
667 ; GFX10-NEXT: s_waitcnt vmcnt(0)
668 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2
669 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
670 ; GFX10-NEXT: s_endpgm
672 %a = load half, ptr addrspace(1) %ina, align 4
673 %b = load half, ptr addrspace(1) %inb, align 4
674 %mul = fmul half %a, %b
675 store half %mul, ptr addrspace(1) %out, align 4
; mul_v2half loads one <2 x half> from %ina and one from %inb (no
; per-workitem indexing), multiplies them element-wise with fmul, and stores
; the result to %out.
; Expected lowering per run:
;  - fiji with -amdgpu-sdwa-peephole=0 expects the high halves to be extracted
;    with v_lshrrev_b32, multiplied with v_mul_f16, shifted back with
;    v_lshlrev_b32 and merged with v_or_b32.
;  - fiji with the peephole expects the high-half multiply as one
;    v_mul_f16_sdwa that reads WORD_1 from both sources and writes WORD_1,
;    followed by v_or_b32.
;  - gfx900 and gfx1010 expect a single packed v_pk_mul_f16.
679 define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
680 ; NOSDWA-LABEL: mul_v2half:
681 ; NOSDWA: ; %bb.0: ; %entry
682 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
683 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
684 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
685 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
686 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
687 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
688 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
689 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
690 ; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
691 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
692 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
693 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
694 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
695 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
696 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
697 ; NOSDWA-NEXT: v_mul_f16_e32 v4, v5, v4
698 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
699 ; NOSDWA-NEXT: v_mul_f16_e32 v2, v3, v2
700 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4
701 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
702 ; NOSDWA-NEXT: s_endpgm
704 ; GFX89-LABEL: mul_v2half:
705 ; GFX89: ; %bb.0: ; %entry
706 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
707 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
708 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
710 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
711 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
712 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
713 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
714 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
715 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
716 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
717 ; GFX89-NEXT: s_waitcnt vmcnt(0)
718 ; GFX89-NEXT: v_mul_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
719 ; GFX89-NEXT: v_mul_f16_e32 v2, v4, v2
720 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v3
721 ; GFX89-NEXT: flat_store_dword v[0:1], v2
722 ; GFX89-NEXT: s_endpgm
724 ; GFX9-LABEL: mul_v2half:
725 ; GFX9: ; %bb.0: ; %entry
726 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
727 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
728 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
729 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
731 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
732 ; GFX9-NEXT: s_waitcnt vmcnt(0)
733 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
734 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
735 ; GFX9-NEXT: s_endpgm
737 ; GFX10-LABEL: mul_v2half:
738 ; GFX10: ; %bb.0: ; %entry
739 ; GFX10-NEXT: s_clause 0x1
740 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
741 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
742 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
743 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
744 ; GFX10-NEXT: s_clause 0x1
745 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
746 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
747 ; GFX10-NEXT: s_waitcnt vmcnt(0)
748 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2
749 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
750 ; GFX10-NEXT: s_endpgm
752 %a = load <2 x half>, ptr addrspace(1) %ina, align 4
753 %b = load <2 x half>, ptr addrspace(1) %inb, align 4
754 %mul = fmul <2 x half> %a, %b
755 store <2 x half> %mul, ptr addrspace(1) %out, align 4
; Elementwise fmul of two <4 x half> vectors loaded from %ina and %inb; the
; product is stored to %out.
; Expected code per check prefix (see the RUN lines at the top of the file):
;   NOSDWA (fiji, -amdgpu-sdwa-peephole=0) - the high halves are extracted
;     with v_lshrrev_b32, multiplied with v_mul_f16_e32, then repacked with
;     v_lshlrev_b32 + v_or_b32.
;   GFX89 (fiji, SDWA peephole enabled) - the high halves are multiplied by
;     v_mul_f16_sdwa with WORD_1 source/destination selects, so the explicit
;     shifts disappear and only the v_or_b32 merge remains.
;   GFX9 (gfx900) and GFX10 (gfx1010) - one v_pk_mul_f16 per dword.
759 define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
760 ; NOSDWA-LABEL: mul_v4half:
761 ; NOSDWA: ; %bb.0: ; %entry
762 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
763 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
764 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
765 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
766 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
767 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
768 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
769 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
770 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
771 ; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
772 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s5
773 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
774 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v3
775 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
776 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v1
777 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v1, v3
778 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
779 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 16, v0
780 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v0, v2
781 ; NOSDWA-NEXT: v_mul_f16_e32 v2, v7, v6
782 ; NOSDWA-NEXT: v_mul_f16_e32 v3, v8, v3
783 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
784 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
785 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2
786 ; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v3
787 ; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
788 ; NOSDWA-NEXT: s_endpgm
790 ; GFX89-LABEL: mul_v4half:
791 ; GFX89: ; %bb.0: ; %entry
792 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
793 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
794 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
795 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
796 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
797 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
798 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
799 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
800 ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
801 ; GFX89-NEXT: v_mov_b32_e32 v4, s4
802 ; GFX89-NEXT: v_mov_b32_e32 v5, s5
803 ; GFX89-NEXT: s_waitcnt vmcnt(0)
804 ; GFX89-NEXT: v_mul_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
805 ; GFX89-NEXT: v_mul_f16_e32 v1, v1, v3
806 ; GFX89-NEXT: v_mul_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
807 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v2
808 ; GFX89-NEXT: v_or_b32_e32 v1, v1, v6
809 ; GFX89-NEXT: v_or_b32_e32 v0, v0, v3
810 ; GFX89-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
811 ; GFX89-NEXT: s_endpgm
813 ; GFX9-LABEL: mul_v4half:
814 ; GFX9: ; %bb.0: ; %entry
815 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
816 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
817 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
818 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
819 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
820 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
821 ; GFX9-NEXT: s_waitcnt vmcnt(0)
822 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
823 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
824 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
825 ; GFX9-NEXT: s_endpgm
827 ; GFX10-LABEL: mul_v4half:
828 ; GFX10: ; %bb.0: ; %entry
829 ; GFX10-NEXT: s_clause 0x1
830 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
831 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
832 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
833 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
834 ; GFX10-NEXT: s_clause 0x1
835 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
836 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
837 ; GFX10-NEXT: s_waitcnt vmcnt(0)
838 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
839 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
840 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
841 ; GFX10-NEXT: s_endpgm
843 %a = load <4 x half>, ptr addrspace(1) %ina, align 4
844 %b = load <4 x half>, ptr addrspace(1) %inb, align 4
845 %mul = fmul <4 x half> %a, %b
846 store <4 x half> %mul, ptr addrspace(1) %out, align 4
; Elementwise fmul of two <8 x half> vectors (four dwords each) loaded from
; %ina and %inb; the product is stored to %out.
; Expected code per check prefix (see the RUN lines at the top of the file):
;   NOSDWA (fiji, -amdgpu-sdwa-peephole=0) - for each of the four dwords the
;     high half is extracted with v_lshrrev_b32, multiplied with
;     v_mul_f16_e32, then repacked with v_lshlrev_b32 + v_or_b32.
;   GFX89 (fiji, SDWA peephole enabled) - each high half is multiplied by a
;     v_mul_f16_sdwa with WORD_1 source/destination selects; only the four
;     v_or_b32 merges remain.
;   GFX9 (gfx900) and GFX10 (gfx1010) - four v_pk_mul_f16, one per dword.
850 define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
851 ; NOSDWA-LABEL: mul_v8half:
852 ; NOSDWA: ; %bb.0: ; %entry
853 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
854 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
855 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
856 ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6
857 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s7
858 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
859 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
860 ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
861 ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
862 ; NOSDWA-NEXT: v_mov_b32_e32 v8, s4
863 ; NOSDWA-NEXT: v_mov_b32_e32 v9, s5
864 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
865 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v10, 16, v3
866 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
867 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v11, 16, v7
868 ; NOSDWA-NEXT: v_mul_f16_e32 v3, v7, v3
869 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v2
870 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v12, 16, v6
871 ; NOSDWA-NEXT: v_mul_f16_e32 v2, v6, v2
872 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v1
873 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5
874 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1
875 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
876 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4
877 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0
878 ; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10
879 ; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7
880 ; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6
881 ; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5
882 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
883 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7
884 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6
885 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
886 ; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4
887 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7
888 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6
889 ; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5
890 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
891 ; NOSDWA-NEXT: s_endpgm
893 ; GFX89-LABEL: mul_v8half:
894 ; GFX89: ; %bb.0: ; %entry
895 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
896 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
897 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
898 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
899 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
900 ; GFX89-NEXT: v_mov_b32_e32 v4, s0
901 ; GFX89-NEXT: v_mov_b32_e32 v5, s1
902 ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
903 ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
904 ; GFX89-NEXT: v_mov_b32_e32 v8, s4
905 ; GFX89-NEXT: v_mov_b32_e32 v9, s5
906 ; GFX89-NEXT: s_waitcnt vmcnt(0)
907 ; GFX89-NEXT: v_mul_f16_sdwa v10, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
908 ; GFX89-NEXT: v_mul_f16_e32 v3, v3, v7
909 ; GFX89-NEXT: v_mul_f16_sdwa v7, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
910 ; GFX89-NEXT: v_mul_f16_e32 v2, v2, v6
911 ; GFX89-NEXT: v_mul_f16_sdwa v6, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
912 ; GFX89-NEXT: v_mul_f16_e32 v1, v1, v5
913 ; GFX89-NEXT: v_mul_f16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
914 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v4
915 ; GFX89-NEXT: v_or_b32_e32 v3, v3, v10
916 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v7
917 ; GFX89-NEXT: v_or_b32_e32 v1, v1, v6
918 ; GFX89-NEXT: v_or_b32_e32 v0, v0, v5
919 ; GFX89-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
920 ; GFX89-NEXT: s_endpgm
922 ; GFX9-LABEL: mul_v8half:
923 ; GFX9: ; %bb.0: ; %entry
924 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
925 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
926 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
927 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
928 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
929 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1]
930 ; GFX9-NEXT: s_waitcnt vmcnt(0)
931 ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7
932 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6
933 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
934 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
935 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
936 ; GFX9-NEXT: s_endpgm
938 ; GFX10-LABEL: mul_v8half:
939 ; GFX10: ; %bb.0: ; %entry
940 ; GFX10-NEXT: s_clause 0x1
941 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
942 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
943 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
944 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
945 ; GFX10-NEXT: s_clause 0x1
946 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
947 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1]
948 ; GFX10-NEXT: s_waitcnt vmcnt(0)
949 ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7
950 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6
951 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5
952 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4
953 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
954 ; GFX10-NEXT: s_endpgm
956 %a = load <8 x half>, ptr addrspace(1) %ina, align 4
957 %b = load <8 x half>, ptr addrspace(1) %inb, align 4
958 %mul = fmul <8 x half> %a, %b
959 store <8 x half> %mul, ptr addrspace(1) %out, align 4
; Scalar i8 multiply: each work item loads one byte from %ina and one from
; %inb at byte offset workitem.id.x, multiplies them, and stores the i8
; product to %out (not indexed by the work-item id).
; Expected code per check prefix (see the RUN lines at the top of the file):
;   every run expects a plain v_mul_lo_u16 on the two flat/global_load_ubyte
;   results - no SDWA instruction appears under any prefix, and the NOSDWA
;   and GFX89 (both fiji) sequences are identical.
;   GFX9 (gfx900) and GFX10 (gfx1010) use the work-item id in v0 directly as
;   the global_load_ubyte offset instead of the 64-bit v_add/v_addc address
;   computation used on fiji.
963 define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
964 ; NOSDWA-LABEL: mul_i8:
965 ; NOSDWA: ; %bb.0: ; %entry
966 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
967 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
968 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
969 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
970 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0
971 ; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
972 ; NOSDWA-NEXT: v_mov_b32_e32 v4, s1
973 ; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0
974 ; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
975 ; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2]
976 ; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4]
977 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
978 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
979 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
980 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v3
981 ; NOSDWA-NEXT: flat_store_byte v[0:1], v2
982 ; NOSDWA-NEXT: s_endpgm
984 ; GFX89-LABEL: mul_i8:
985 ; GFX89: ; %bb.0: ; %entry
986 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
987 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
988 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
989 ; GFX89-NEXT: v_mov_b32_e32 v2, s7
990 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0
991 ; GFX89-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
992 ; GFX89-NEXT: v_mov_b32_e32 v4, s1
993 ; GFX89-NEXT: v_add_u32_e32 v3, vcc, s0, v0
994 ; GFX89-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
995 ; GFX89-NEXT: flat_load_ubyte v2, v[1:2]
996 ; GFX89-NEXT: flat_load_ubyte v3, v[3:4]
997 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
998 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
999 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1000 ; GFX89-NEXT: v_mul_lo_u16_e32 v2, v2, v3
1001 ; GFX89-NEXT: flat_store_byte v[0:1], v2
1002 ; GFX89-NEXT: s_endpgm
1004 ; GFX9-LABEL: mul_i8:
1005 ; GFX9: ; %bb.0: ; %entry
1006 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1007 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1008 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1009 ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
1010 ; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1]
1011 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1012 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1013 ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2
1014 ; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
1015 ; GFX9-NEXT: s_endpgm
1017 ; GFX10-LABEL: mul_i8:
1018 ; GFX10: ; %bb.0: ; %entry
1019 ; GFX10-NEXT: s_clause 0x1
1020 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1021 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1022 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1023 ; GFX10-NEXT: s_clause 0x1
1024 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
1025 ; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1]
1026 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1027 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1028 ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2
1029 ; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
1030 ; GFX10-NEXT: s_endpgm
1032 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1033 %gepa = getelementptr i8, ptr addrspace(1) %ina, i32 %idx
1034 %gepb = getelementptr i8, ptr addrspace(1) %inb, i32 %idx
1035 %a = load i8, ptr addrspace(1) %gepa, align 4
1036 %b = load i8, ptr addrspace(1) %gepb, align 4
1037 %mul = mul i8 %a, %b
1038 store i8 %mul, ptr addrspace(1) %out, align 4
; Elementwise mul of two <2 x i8> vectors: each work item loads the
; <2 x i8> element number workitem.id.x from %ina and from %inb, multiplies
; them, and stores the <2 x i8> product to %out.
; Expected code per check prefix (see the RUN lines at the top of the file):
;   NOSDWA (fiji, -amdgpu-sdwa-peephole=0) - the high bytes are extracted
;     with v_lshrrev_b16, both lanes multiplied with v_mul_lo_u16_e32, and
;     the result repacked with v_and_b32 0xff, v_lshlrev_b16 and v_or_b32.
;   GFX89 (fiji, SDWA peephole enabled) and GFX9 (gfx900) - the high-byte
;     multiply is a v_mul_lo_u16_sdwa with BYTE_1 selects, and the merge is a
;     v_or_b32_sdwa with src0_sel:BYTE_0, replacing the shifts and the mask.
;   GFX10 (gfx1010) - the multiplies stay as explicit v_lshrrev_b16 /
;     v_mul_lo_u16 / v_lshlrev_b16; only the final merge is a v_or_b32_sdwa.
1042 define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1043 ; NOSDWA-LABEL: mul_v2i8:
1044 ; NOSDWA: ; %bb.0: ; %entry
1045 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1046 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1047 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1048 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1049 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1050 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1051 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1052 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1053 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1054 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1055 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
1056 ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
1057 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
1058 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
1059 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1060 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v3, 8, v4
1061 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1062 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v5, 8, v2
1063 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2
1064 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5
1065 ; NOSDWA-NEXT: v_and_b32_e32 v2, 0xff, v2
1066 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v3, 8, v3
1067 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
1068 ; NOSDWA-NEXT: flat_store_short v[0:1], v2
1069 ; NOSDWA-NEXT: s_endpgm
1071 ; GFX89-LABEL: mul_v2i8:
1072 ; GFX89: ; %bb.0: ; %entry
1073 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1074 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1075 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1076 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1077 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1078 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1079 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1080 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1081 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1082 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1083 ; GFX89-NEXT: flat_load_ushort v4, v[0:1]
1084 ; GFX89-NEXT: flat_load_ushort v2, v[2:3]
1085 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
1086 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
1087 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1088 ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2
1089 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1090 ; GFX89-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1091 ; GFX89-NEXT: flat_store_short v[0:1], v2
1092 ; GFX89-NEXT: s_endpgm
1094 ; GFX9-LABEL: mul_v2i8:
1095 ; GFX9: ; %bb.0: ; %entry
1096 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1097 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1098 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1099 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1100 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
1101 ; GFX9-NEXT: global_load_ushort v2, v0, s[0:1]
1102 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1104 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2
1105 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1106 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1107 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
1108 ; GFX9-NEXT: s_endpgm
1110 ; GFX10-LABEL: mul_v2i8:
1111 ; GFX10: ; %bb.0: ; %entry
1112 ; GFX10-NEXT: s_clause 0x1
1113 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1114 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1115 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1116 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1117 ; GFX10-NEXT: s_clause 0x1
1118 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
1119 ; GFX10-NEXT: global_load_ushort v2, v0, s[0:1]
1120 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1121 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1
1122 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1123 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2
1124 ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2
1125 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1126 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v3
1127 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
1128 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1129 ; GFX10-NEXT: global_store_short v2, v0, s[4:5]
1130 ; GFX10-NEXT: s_endpgm
1132 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1133 %gepa = getelementptr <2 x i8>, ptr addrspace(1) %ina, i32 %idx
1134 %gepb = getelementptr <2 x i8>, ptr addrspace(1) %inb, i32 %idx
1135 %a = load <2 x i8>, ptr addrspace(1) %gepa, align 4
1136 %b = load <2 x i8>, ptr addrspace(1) %gepb, align 4
1137 %mul = mul <2 x i8> %a, %b
1138 store <2 x i8> %mul, ptr addrspace(1) %out, align 4
; Elementwise mul of two <4 x i8> vectors (one dword each): each work item
; loads the <4 x i8> element number workitem.id.x from %ina and from %inb,
; multiplies them, and stores the <4 x i8> product to %out.
; Expected code per check prefix (see the RUN lines at the top of the file):
;   NOSDWA (fiji, -amdgpu-sdwa-peephole=0) - bytes 1, 2 and 3 are extracted
;     with v_lshrrev_b16 / v_lshrrev_b32, all four lanes multiplied with
;     v_mul_lo_u16_e32, and the dword rebuilt with v_and_b32, shifts and
;     v_or_b32.
;   GFX89 (fiji, SDWA peephole enabled) and GFX9 (gfx900) - bytes 1 and 3 are
;     multiplied by v_mul_lo_u16_sdwa with BYTE_1 / BYTE_3 source selects,
;     byte 2 with WORD_1 source selects, and the result is rebuilt with three
;     v_or_b32_sdwa; no explicit shift or mask remains.
;   GFX10 (gfx1010) - extraction and multiplies stay explicit; only the three
;     merging ors are v_or_b32_sdwa.
1142 define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1143 ; NOSDWA-LABEL: mul_v4i8:
1144 ; NOSDWA: ; %bb.0: ; %entry
1145 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1146 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1147 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1148 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1149 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1150 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1151 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1152 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1153 ; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
1154 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1155 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1156 ; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
1157 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
1158 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
1159 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1160 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1161 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 24, v4
1162 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v6, 8, v4
1163 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1164 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v2
1165 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 24, v2
1166 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v9, 8, v2
1167 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2
1168 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v4, v6, v9
1169 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v8
1170 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7
1171 ; NOSDWA-NEXT: v_and_b32_e32 v2, 0xff, v2
1172 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1173 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5
1174 ; NOSDWA-NEXT: v_and_b32_e32 v3, 0xff, v3
1175 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4
1176 ; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5
1177 ; NOSDWA-NEXT: v_and_b32_e32 v2, 0xffff, v2
1178 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1179 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
1180 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1181 ; NOSDWA-NEXT: s_endpgm
1183 ; GFX89-LABEL: mul_v4i8:
1184 ; GFX89: ; %bb.0: ; %entry
1185 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1186 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1187 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1188 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1189 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1190 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1191 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1192 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1193 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1194 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1195 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
1196 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
1197 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
1198 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
1199 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1200 ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2
1201 ; GFX89-NEXT: v_mul_lo_u16_sdwa v5, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1202 ; GFX89-NEXT: v_mul_lo_u16_sdwa v6, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1203 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1204 ; GFX89-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1205 ; GFX89-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1206 ; GFX89-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1207 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1208 ; GFX89-NEXT: s_endpgm
1210 ; GFX9-LABEL: mul_v4i8:
1211 ; GFX9: ; %bb.0: ; %entry
1212 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1213 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1214 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1215 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1216 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1217 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
1218 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1220 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2
1221 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1222 ; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1223 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1224 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1225 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1226 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1227 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1228 ; GFX9-NEXT: s_endpgm
1230 ; GFX10-LABEL: mul_v4i8:
1231 ; GFX10: ; %bb.0: ; %entry
1232 ; GFX10-NEXT: s_clause 0x1
1233 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1234 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1235 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1236 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX10-NEXT: s_clause 0x1
1238 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1239 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
1240 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1241 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
1242 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1
1243 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1244 ; GFX10-NEXT: v_lshrrev_b16 v4, 8, v2
1245 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2
1246 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1247 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
1248 ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2
1249 ; GFX10-NEXT: v_mul_lo_u16 v3, v3, v4
1250 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v5
1251 ; GFX10-NEXT: v_mul_lo_u16 v2, v6, v7
1252 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
1253 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
1254 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1255 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1256 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1257 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1258 ; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
1259 ; GFX10-NEXT: s_endpgm
1261 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1262 %gepa = getelementptr <4 x i8>, ptr addrspace(1) %ina, i32 %idx
1263 %gepb = getelementptr <4 x i8>, ptr addrspace(1) %inb, i32 %idx
1264 %a = load <4 x i8>, ptr addrspace(1) %gepa, align 4
1265 %b = load <4 x i8>, ptr addrspace(1) %gepb, align 4
1266 %mul = mul <4 x i8> %a, %b
1267 store <4 x i8> %mul, ptr addrspace(1) %out, align 4
; Elementwise mul of two <8 x i8> vectors (two dwords each): each work item
; loads the <8 x i8> element number workitem.id.x from %ina and from %inb,
; multiplies them, and stores the <8 x i8> product to %out.
; Expected code per check prefix (see the RUN lines at the top of the file):
;   NOSDWA (fiji, -amdgpu-sdwa-peephole=0) - for both dwords, bytes 1, 2 and
;     3 are extracted with v_lshrrev_b16 / v_lshrrev_b32, all eight lanes
;     multiplied with v_mul_lo_u16_e32, and the dwords rebuilt with
;     v_and_b32, shifts and v_or_b32.
;   GFX89 (fiji, SDWA peephole enabled) and GFX9 (gfx900) - per dword, bytes
;     1 and 3 are multiplied by v_mul_lo_u16_sdwa with BYTE_1 / BYTE_3 source
;     selects, byte 2 with WORD_1 source selects, and the dword is rebuilt
;     with three v_or_b32_sdwa; no explicit shift or mask remains.
;   GFX10 (gfx1010) - extraction and multiplies stay explicit; only the six
;     merging ors are v_or_b32_sdwa.
1271 define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1272 ; NOSDWA-LABEL: mul_v8i8:
1273 ; NOSDWA: ; %bb.0: ; %entry
1274 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1275 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1276 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1277 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1278 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1279 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1280 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1281 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1282 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1283 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1284 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1285 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1286 ; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
1287 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s5
1288 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1289 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v0
1290 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 24, v0
1291 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v8, 8, v0
1292 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 16, v1
1293 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v10, 24, v1
1294 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v11, 8, v1
1295 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1296 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v12, 16, v2
1297 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 24, v2
1298 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v14, 8, v2
1299 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1300 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v16, 24, v3
1301 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v17, 8, v3
1302 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3
1303 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2
1304 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v11, v17
1305 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v10, v16
1306 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v9, v9, v15
1307 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v8, v8, v14
1308 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v7, v13
1309 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v6, v12
1310 ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1
1311 ; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0
1312 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v2, 8, v2
1313 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v3, 8, v3
1314 ; NOSDWA-NEXT: v_and_b32_e32 v9, 0xff, v9
1315 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8
1316 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v7, 8, v7
1317 ; NOSDWA-NEXT: v_and_b32_e32 v6, 0xff, v6
1318 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2
1319 ; NOSDWA-NEXT: v_or_b32_e32 v2, v9, v3
1320 ; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v8
1321 ; NOSDWA-NEXT: v_or_b32_e32 v3, v6, v7
1322 ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1
1323 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1324 ; NOSDWA-NEXT: v_and_b32_e32 v0, 0xffff, v0
1325 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1326 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2
1327 ; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v3
1328 ; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
1329 ; NOSDWA-NEXT: s_endpgm
1331 ; GFX89-LABEL: mul_v8i8:
1332 ; GFX89: ; %bb.0: ; %entry
1333 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1334 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1335 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1336 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1337 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1338 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1339 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1340 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1341 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1342 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1343 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1344 ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1345 ; GFX89-NEXT: v_mov_b32_e32 v4, s4
1346 ; GFX89-NEXT: v_mov_b32_e32 v5, s5
1347 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1348 ; GFX89-NEXT: v_mul_lo_u16_e32 v6, v1, v3
1349 ; GFX89-NEXT: v_mul_lo_u16_sdwa v7, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1350 ; GFX89-NEXT: v_mul_lo_u16_sdwa v8, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1351 ; GFX89-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1352 ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v0, v2
1353 ; GFX89-NEXT: v_mul_lo_u16_sdwa v9, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1354 ; GFX89-NEXT: v_mul_lo_u16_sdwa v10, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1355 ; GFX89-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1356 ; GFX89-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1357 ; GFX89-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1358 ; GFX89-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1359 ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1360 ; GFX89-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1361 ; GFX89-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1362 ; GFX89-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
1363 ; GFX89-NEXT: s_endpgm
1365 ; GFX9-LABEL: mul_v8i8:
1366 ; GFX9: ; %bb.0: ; %entry
1367 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1368 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1369 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1370 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1371 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
1372 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
1373 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1374 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1375 ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3
1376 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1377 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1378 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1379 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v0, v2
1380 ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1381 ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1382 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1383 ; GFX9-NEXT: v_or_b32_sdwa v2, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1384 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1385 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1386 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1387 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1388 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1389 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
1390 ; GFX9-NEXT: s_endpgm
1392 ; GFX10-LABEL: mul_v8i8:
1393 ; GFX10: ; %bb.0: ; %entry
1394 ; GFX10-NEXT: s_clause 0x1
1395 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1396 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1397 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1398 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1399 ; GFX10-NEXT: s_clause 0x1
1400 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
1401 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
1402 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1403 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
1404 ; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0
1405 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1
1406 ; GFX10-NEXT: v_lshrrev_b16 v9, 8, v1
1407 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1408 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2
1409 ; GFX10-NEXT: v_lshrrev_b16 v12, 8, v2
1410 ; GFX10-NEXT: v_lshrrev_b16 v13, 8, v3
1411 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v3
1412 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
1413 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1414 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
1415 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1416 ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v3
1417 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2
1418 ; GFX10-NEXT: v_mul_lo_u16 v2, v9, v13
1419 ; GFX10-NEXT: v_mul_lo_u16 v3, v8, v14
1420 ; GFX10-NEXT: v_mul_lo_u16 v6, v6, v12
1421 ; GFX10-NEXT: v_mul_lo_u16 v5, v5, v11
1422 ; GFX10-NEXT: v_mul_lo_u16 v7, v7, v15
1423 ; GFX10-NEXT: v_mul_lo_u16 v4, v4, v10
1424 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
1425 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
1426 ; GFX10-NEXT: v_lshlrev_b16 v6, 8, v6
1427 ; GFX10-NEXT: v_lshlrev_b16 v5, 8, v5
1428 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1429 ; GFX10-NEXT: v_or_b32_sdwa v2, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1430 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1431 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1432 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1433 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1434 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1435 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
1436 ; GFX10-NEXT: s_endpgm
1438 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1439 %gepa = getelementptr <8 x i8>, ptr addrspace(1) %ina, i32 %idx
1440 %gepb = getelementptr <8 x i8>, ptr addrspace(1) %inb, i32 %idx
1441 %a = load <8 x i8>, ptr addrspace(1) %gepa, align 4
1442 %b = load <8 x i8>, ptr addrspace(1) %gepb, align 4
1443 %mul = mul <8 x i8> %a, %b
1444 store <8 x i8> %mul, ptr addrspace(1) %out, align 4
; sitofp_v2i16_to_v2f16: loads a <2 x i16> from %a, converts it to <2 x half>
; with sitofp, and stores the result to %r.
; With the SDWA peephole enabled, the expected code reads the high word directly
; through v_cvt_f16_i16_sdwa (src0_sel:WORD_1) instead of the separate
; v_lshrrev_b32 that appears in the peephole-disabled fiji output.
; NOTE(review): the "or" in the FIXME below presumably refers to the final
; v_or_b32 that merges the two converted halves on fiji - confirm.
1448 ; FIXME: Should be able to avoid or
1449 define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
1450 ; NOSDWA-LABEL: sitofp_v2i16_to_v2f16:
1451 ; NOSDWA: ; %bb.0: ; %entry
1452 ; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1453 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1454 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
1455 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
1456 ; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
1457 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
1458 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
1459 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1460 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
1461 ; NOSDWA-NEXT: v_cvt_f16_i16_e32 v3, v3
1462 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1463 ; NOSDWA-NEXT: v_cvt_f16_i16_e32 v2, v2
1464 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
1465 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1466 ; NOSDWA-NEXT: s_endpgm
1468 ; GFX89-LABEL: sitofp_v2i16_to_v2f16:
1469 ; GFX89: ; %bb.0: ; %entry
1470 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1471 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1472 ; GFX89-NEXT: v_mov_b32_e32 v0, s2
1473 ; GFX89-NEXT: v_mov_b32_e32 v1, s3
1474 ; GFX89-NEXT: flat_load_dword v2, v[0:1]
1475 ; GFX89-NEXT: v_mov_b32_e32 v0, s0
1476 ; GFX89-NEXT: v_mov_b32_e32 v1, s1
1477 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1478 ; GFX89-NEXT: v_cvt_f16_i16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1479 ; GFX89-NEXT: v_cvt_f16_i16_e32 v2, v2
1480 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v3
1481 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1482 ; GFX89-NEXT: s_endpgm
1484 ; GFX9-LABEL: sitofp_v2i16_to_v2f16:
1485 ; GFX9: ; %bb.0: ; %entry
1486 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1487 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1488 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1489 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1490 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1491 ; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1
1492 ; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1493 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1494 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1495 ; GFX9-NEXT: s_endpgm
1497 ; GFX10-LABEL: sitofp_v2i16_to_v2f16:
1498 ; GFX10: ; %bb.0: ; %entry
1499 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1500 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1501 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1503 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1504 ; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1
1505 ; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1506 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
1507 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1508 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1509 ; GFX10-NEXT: s_endpgm
1510 ptr addrspace(1) %r,
1511 ptr addrspace(1) %a) #0 {
1513 %a.val = load <2 x i16>, ptr addrspace(1) %a
1514 %r.val = sitofp <2 x i16> %a.val to <2 x half>
1515 store <2 x half> %r.val, ptr addrspace(1) %r
; mac_v2half: loads <2 x half> values %a and %b from %ina and %inb and stores
; (%a * %b) + %b to %out.
; The fiji outputs expect per-half v_mac_f16, with v_mac_f16_sdwa selecting the
; high words of both sources (src0_sel/src1_sel WORD_1) when the peephole is on.
; The gfx900 and gfx1010 outputs expect the packed v_pk_mul_f16 / v_pk_add_f16
; pair instead.
1519 define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1520 ; NOSDWA-LABEL: mac_v2half:
1521 ; NOSDWA: ; %bb.0: ; %entry
1522 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1523 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1524 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1525 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
1526 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
1527 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1528 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1529 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
1530 ; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
1531 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
1532 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
1533 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1534 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1535 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1536 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
1537 ; NOSDWA-NEXT: v_mac_f16_e32 v4, v5, v4
1538 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1539 ; NOSDWA-NEXT: v_mac_f16_e32 v2, v3, v2
1540 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4
1541 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1542 ; NOSDWA-NEXT: s_endpgm
1544 ; GFX89-LABEL: mac_v2half:
1545 ; GFX89: ; %bb.0: ; %entry
1546 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1547 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1548 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1549 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
1550 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
1551 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1552 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1553 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
1554 ; GFX89-NEXT: flat_load_dword v3, v[0:1]
1555 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
1556 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
1557 ; GFX89-NEXT: s_waitcnt vmcnt(1)
1558 ; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1559 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1560 ; GFX89-NEXT: v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1561 ; GFX89-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1562 ; GFX89-NEXT: v_mac_f16_e32 v2, v3, v2
1563 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v4
1564 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1565 ; GFX89-NEXT: s_endpgm
1567 ; GFX9-LABEL: mac_v2half:
1568 ; GFX9: ; %bb.0: ; %entry
1569 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1570 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1571 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1572 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1573 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1574 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
1575 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1576 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
1577 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2
1578 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1579 ; GFX9-NEXT: s_endpgm
1581 ; GFX10-LABEL: mac_v2half:
1582 ; GFX10: ; %bb.0: ; %entry
1583 ; GFX10-NEXT: s_clause 0x1
1584 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1585 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1586 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1587 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1588 ; GFX10-NEXT: s_clause 0x1
1589 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1590 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
1591 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1592 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2
1593 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2
1594 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1595 ; GFX10-NEXT: s_endpgm
1597 %a = load <2 x half>, ptr addrspace(1) %ina, align 4
1598 %b = load <2 x half>, ptr addrspace(1) %inb, align 4
1599 %mul = fmul <2 x half> %a, %b
1600 %mac = fadd <2 x half> %mul, %b
1601 store <2 x half> %mac, ptr addrspace(1) %out, align 4
; immediate_mul_v2i16: each work-item (indexed by workitem.id.x) loads a
; <2 x i16> from %in, multiplies it by the constant vector <123, 321> and
; stores the product to %out.
; 123 is 0x7b and 321 is 0x141. The fiji outputs multiply each half by its own
; constant, while gfx900 and gfx1010 use one v_pk_mul_lo_u16 with the packed
; literal 0x141007b.
1605 define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1606 ; NOSDWA-LABEL: immediate_mul_v2i16:
1607 ; NOSDWA: ; %bb.0: ; %entry
1608 ; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1609 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1610 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1611 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
1612 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1613 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1614 ; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
1615 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
1616 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
1617 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1618 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, 0x7b, v2
1619 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1620 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, 0x141, v2
1621 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1622 ; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2
1623 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1624 ; NOSDWA-NEXT: s_endpgm
1626 ; GFX89-LABEL: immediate_mul_v2i16:
1627 ; GFX89: ; %bb.0: ; %entry
1628 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1629 ; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1630 ; GFX89-NEXT: v_mov_b32_e32 v3, 0x141
1631 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1632 ; GFX89-NEXT: v_mov_b32_e32 v1, s3
1633 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1634 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1635 ; GFX89-NEXT: flat_load_dword v2, v[0:1]
1636 ; GFX89-NEXT: v_mov_b32_e32 v0, s0
1637 ; GFX89-NEXT: v_mov_b32_e32 v1, s1
1638 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1639 ; GFX89-NEXT: v_mul_lo_u16_e32 v4, 0x7b, v2
1640 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1641 ; GFX89-NEXT: v_or_b32_e32 v2, v4, v2
1642 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1643 ; GFX89-NEXT: s_endpgm
1645 ; GFX9-LABEL: immediate_mul_v2i16:
1646 ; GFX9: ; %bb.0: ; %entry
1647 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1648 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1649 ; GFX9-NEXT: s_mov_b32 s0, 0x141007b
1650 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1651 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1652 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
1653 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1654 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0
1655 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
1656 ; GFX9-NEXT: s_endpgm
1658 ; GFX10-LABEL: immediate_mul_v2i16:
1659 ; GFX10: ; %bb.0: ; %entry
1660 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1661 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1662 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1663 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1664 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
1665 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1666 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0
1667 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
1668 ; GFX10-NEXT: s_endpgm
1670 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1671 %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %idx
1672 %a = load <2 x i16>, ptr addrspace(1) %gep, align 4
1673 %mul = mul <2 x i16> %a, <i16 123, i16 321>
1674 store <2 x i16> %mul, ptr addrspace(1) %out, align 4
; mulmul_v2i16: each work-item (indexed by workitem.id.x) loads <2 x i16>
; values %a and %b from %ina and %inb, computes (%a * %b) * %b and stores the
; result to %out. %b is an operand of both multiplies, which is the "double
; use of same src" the comment below refers to.
1678 ; Double use of same src - should not convert it
1679 define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1680 ; NOSDWA-LABEL: mulmul_v2i16:
1681 ; NOSDWA: ; %bb.0: ; %entry
1682 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1683 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1684 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1685 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1686 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1687 ; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1688 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1689 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1690 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1691 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1692 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
1693 ; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
1694 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
1695 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
1696 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1697 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1698 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1699 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
1700 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v2
1701 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v4
1702 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v3, v2
1703 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v5, v4
1704 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1705 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
1706 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1707 ; NOSDWA-NEXT: s_endpgm
1709 ; GFX89-LABEL: mulmul_v2i16:
1710 ; GFX89: ; %bb.0: ; %entry
1711 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1712 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1713 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1714 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1715 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1716 ; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1717 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1718 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1719 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1720 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1721 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
1722 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
1723 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
1724 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
1725 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1726 ; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1727 ; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2
1728 ; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2
1729 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1730 ; GFX89-NEXT: v_or_b32_e32 v2, v4, v2
1731 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1732 ; GFX89-NEXT: s_endpgm
1734 ; GFX9-LABEL: mulmul_v2i16:
1735 ; GFX9: ; %bb.0: ; %entry
1736 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1737 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1738 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1739 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1740 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1741 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
1742 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1743 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1744 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1745 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
1746 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1747 ; GFX9-NEXT: s_endpgm
1749 ; GFX10-LABEL: mulmul_v2i16:
1750 ; GFX10: ; %bb.0: ; %entry
1751 ; GFX10-NEXT: s_clause 0x1
1752 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1753 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1754 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1755 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1756 ; GFX10-NEXT: s_clause 0x1
1757 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1758 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
1759 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1760 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2
1761 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1762 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2
1763 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
1764 ; GFX10-NEXT: s_endpgm
1766 %idx = call i32 @llvm.amdgcn.workitem.id.x()
1767 %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx
1768 %gepb = getelementptr <2 x i16>, ptr addrspace(1) %inb, i32 %idx
1769 %a = load <2 x i16>, ptr addrspace(1) %gepa, align 4
1770 %b = load <2 x i16>, ptr addrspace(1) %gepb, align 4
1771 %mul = mul <2 x i16> %a, %b
1772 %mul2 = mul <2 x i16> %mul, %b
1773 store <2 x i16> %mul2, ptr addrspace(1) %out, align 4
; add_bb_v2i16: loads <2 x i16> values from %ina and %inb, adds them, and
; stores the sum to %out. In the IR the add is followed by an unconditional
; branch to %store_label before the store.
; NOTE(review): presumably exercises the SDWA peephole when the pieces of the
; pattern sit in different IR basic blocks - confirm.
1777 define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
1778 ; NOSDWA-LABEL: add_bb_v2i16:
1779 ; NOSDWA: ; %bb.0: ; %entry
1780 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1781 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1782 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1783 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
1784 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
1785 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
1786 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
1787 ; NOSDWA-NEXT: flat_load_dword v1, v[0:1]
1788 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
1789 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
1790 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
1791 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1792 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1793 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1794 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2
1795 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4
1796 ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1
1797 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1798 ; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2
1799 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
1800 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
1801 ; NOSDWA-NEXT: s_endpgm
1803 ; GFX89-LABEL: add_bb_v2i16:
1804 ; GFX89: ; %bb.0: ; %entry
1805 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1806 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1807 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1808 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
1809 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
1810 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
1811 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
1812 ; GFX89-NEXT: flat_load_dword v1, v[0:1]
1813 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
1814 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
1815 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1816 ; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1817 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2
1818 ; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1819 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
1820 ; GFX89-NEXT: flat_store_dword v[0:1], v2
1821 ; GFX89-NEXT: s_endpgm
1823 ; GFX9-LABEL: add_bb_v2i16:
1824 ; GFX9: ; %bb.0: ; %entry
1825 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1826 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1827 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1828 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1829 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1830 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
1831 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1832 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
1833 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1834 ; GFX9-NEXT: s_endpgm
1836 ; GFX10-LABEL: add_bb_v2i16:
1837 ; GFX10: ; %bb.0: ; %entry
1838 ; GFX10-NEXT: s_clause 0x1
1839 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1840 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1841 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1842 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1843 ; GFX10-NEXT: s_clause 0x1
1844 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1845 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
1846 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1847 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
1848 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1849 ; GFX10-NEXT: s_endpgm
1851 %a = load <2 x i16>, ptr addrspace(1) %ina, align 4
1852 %b = load <2 x i16>, ptr addrspace(1) %inb, align 4
1855 %add = add <2 x i16> %a, %b
1856 br label %store_label
1858 store <2 x i16> %add, ptr addrspace(1) %out, align 4
; pulled_out_test: loads an <8 x i8> from %sourceA, extracts all eight bytes,
; reassembles them in their original order (four <2 x i8> insertelement pairs,
; then shufflevector concatenations up to <8 x i8>) and stores the rebuilt
; vector to %destValues. Both pointers are indexed by the same constant
; expression %idxprom.
1862 ; Check that "pulling out" SDWA operands works correctly.
1863 define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 {
1864 ; NOSDWA-LABEL: pulled_out_test:
1865 ; NOSDWA: ; %bb.0: ; %entry
1866 ; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1867 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
1868 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
1869 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
1870 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1871 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
1872 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
1873 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1874 ; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0
1875 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0
1876 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 24, v0
1877 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1878 ; NOSDWA-NEXT: v_and_b32_e32 v7, 0xff, v1
1879 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 8, v1
1880 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 24, v1
1881 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1882 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5
1883 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v6, 8, v6
1884 ; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0
1885 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8
1886 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v9, 8, v9
1887 ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1
1888 ; NOSDWA-NEXT: v_or_b32_e32 v4, v4, v5
1889 ; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v6
1890 ; NOSDWA-NEXT: v_or_b32_e32 v5, v7, v8
1891 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v9
1892 ; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v4
1893 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1894 ; NOSDWA-NEXT: v_and_b32_e32 v5, 0xffff, v5
1895 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1896 ; NOSDWA-NEXT: v_or_b32_e32 v0, v4, v0
1897 ; NOSDWA-NEXT: v_or_b32_e32 v1, v5, v1
1898 ; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1899 ; NOSDWA-NEXT: s_endpgm
1901 ; GFX89-LABEL: pulled_out_test:
1902 ; GFX89: ; %bb.0: ; %entry
1903 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1904 ; GFX89-NEXT: v_mov_b32_e32 v4, 8
1905 ; GFX89-NEXT: v_mov_b32_e32 v5, 0xff
1906 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1907 ; GFX89-NEXT: v_mov_b32_e32 v0, s0
1908 ; GFX89-NEXT: v_mov_b32_e32 v1, s1
1909 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1910 ; GFX89-NEXT: v_mov_b32_e32 v2, s2
1911 ; GFX89-NEXT: v_mov_b32_e32 v3, s3
1912 ; GFX89-NEXT: s_waitcnt vmcnt(0)
1913 ; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1914 ; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0
1915 ; GFX89-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1916 ; GFX89-NEXT: v_lshrrev_b32_e32 v9, 24, v1
1917 ; GFX89-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1918 ; GFX89-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1919 ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1920 ; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, v7
1921 ; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1922 ; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v9
1923 ; GFX89-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1924 ; GFX89-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1925 ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1926 ; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1927 ; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1928 ; GFX89-NEXT: s_endpgm
1930 ; GFX9-LABEL: pulled_out_test:
1931 ; GFX9: ; %bb.0: ; %entry
1932 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1933 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1934 ; GFX9-NEXT: v_mov_b32_e32 v3, 8
1935 ; GFX9-NEXT: s_movk_i32 s0, 0xff
1936 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1937 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
1938 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1939 ; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1940 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
1941 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1942 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
1943 ; GFX9-NEXT: v_and_b32_sdwa v6, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1944 ; GFX9-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1945 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1946 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v5
1947 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1948 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7
1949 ; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1950 ; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1951 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1952 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1953 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1954 ; GFX9-NEXT: s_endpgm
1956 ; GFX10-LABEL: pulled_out_test:
1957 ; GFX10: ; %bb.0: ; %entry
1958 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1959 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1960 ; GFX10-NEXT: v_mov_b32_e32 v3, 8
1961 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
1962 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xff
1963 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1964 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
1965 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1966 ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1967 ; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1968 ; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1969 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1970 ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1971 ; GFX10-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1972 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1973 ; GFX10-NEXT: v_or_b32_sdwa v6, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1974 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1975 ; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1976 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1977 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1978 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1979 ; GFX10-NEXT: s_endpgm
1981 %idxprom = ashr exact i64 15, 32
1982 %arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom
1983 %tmp = load <8 x i8>, ptr addrspace(1) %arrayidx, align 8
1985 %tmp1 = extractelement <8 x i8> %tmp, i32 0
1986 %tmp2 = extractelement <8 x i8> %tmp, i32 1
1987 %tmp3 = extractelement <8 x i8> %tmp, i32 2
1988 %tmp4 = extractelement <8 x i8> %tmp, i32 3
1989 %tmp5 = extractelement <8 x i8> %tmp, i32 4
1990 %tmp6 = extractelement <8 x i8> %tmp, i32 5
1991 %tmp7 = extractelement <8 x i8> %tmp, i32 6
1992 %tmp8 = extractelement <8 x i8> %tmp, i32 7
1994 %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
1995 %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
1996 %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
1997 %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
1998 %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
1999 %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
2000 %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
2001 %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1
2003 %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2004 %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2005 %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2007 %arrayidx5 = getelementptr inbounds <8 x i8>, ptr addrspace(1) %destValues, i64 %idxprom
2008 store <8 x i8> %tmp19, ptr addrspace(1) %arrayidx5, align 8
; sdwa_crash_inlineasm_def: the i32 produced by an inline-asm v_and_b32_e32
; (inputs 65535 and undef) is OR'ed with 65536 (0x10000) and inserted into
; element 0 of a <2 x i32>, which reaches a volatile store through a phi.
; All four expected outputs keep a plain v_or_b32_e32 with the 0x10000 literal
; after the inline asm, i.e. no SDWA form is expected for it.
; NOTE(review): the name suggests a crash regression test for an SDWA source
; operand defined by inline asm - confirm against the originating commit.
2012 ; TODO: Why is the constant not peepholed into the v_or_b32_e32?
2013 define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
2014 ; NOSDWA-LABEL: sdwa_crash_inlineasm_def:
2015 ; NOSDWA: ; %bb.0: ; %bb
2016 ; NOSDWA-NEXT: s_mov_b32 s0, 0xffff
2017 ; NOSDWA-NEXT: ;;#ASMSTART
2018 ; NOSDWA-NEXT: v_and_b32_e32 v0, s0, v0
2019 ; NOSDWA-NEXT: ;;#ASMEND
2020 ; NOSDWA-NEXT: v_or_b32_e32 v0, 0x10000, v0
2021 ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
2022 ; NOSDWA-NEXT: .LBB21_1: ; %bb1
2023 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
2024 ; NOSDWA-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
2025 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
2026 ; NOSDWA-NEXT: s_mov_b64 vcc, vcc
2027 ; NOSDWA-NEXT: s_cbranch_vccnz .LBB21_1
2028 ; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock
2029 ; NOSDWA-NEXT: s_endpgm
2031 ; GFX89-LABEL: sdwa_crash_inlineasm_def:
2032 ; GFX89: ; %bb.0: ; %bb
2033 ; GFX89-NEXT: s_mov_b32 s0, 0xffff
2034 ; GFX89-NEXT: ;;#ASMSTART
2035 ; GFX89-NEXT: v_and_b32_e32 v0, s0, v0
2036 ; GFX89-NEXT: ;;#ASMEND
2037 ; GFX89-NEXT: v_or_b32_e32 v0, 0x10000, v0
2038 ; GFX89-NEXT: s_and_b64 vcc, exec, -1
2039 ; GFX89-NEXT: .LBB21_1: ; %bb1
2040 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
2041 ; GFX89-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
2042 ; GFX89-NEXT: s_waitcnt vmcnt(0)
2043 ; GFX89-NEXT: s_mov_b64 vcc, vcc
2044 ; GFX89-NEXT: s_cbranch_vccnz .LBB21_1
2045 ; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock
2046 ; GFX89-NEXT: s_endpgm
2048 ; GFX9-LABEL: sdwa_crash_inlineasm_def:
2049 ; GFX9: ; %bb.0: ; %bb
2050 ; GFX9-NEXT: s_mov_b32 s0, 0xffff
2051 ; GFX9-NEXT: ;;#ASMSTART
2052 ; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
2053 ; GFX9-NEXT: ;;#ASMEND
2054 ; GFX9-NEXT: v_or_b32_e32 v0, 0x10000, v0
2055 ; GFX9-NEXT: s_and_b64 vcc, exec, -1
2056 ; GFX9-NEXT: .LBB21_1: ; %bb1
2057 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2058 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
2059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2060 ; GFX9-NEXT: s_mov_b64 vcc, vcc
2061 ; GFX9-NEXT: s_cbranch_vccnz .LBB21_1
2062 ; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock
2063 ; GFX9-NEXT: s_endpgm
2065 ; GFX10-LABEL: sdwa_crash_inlineasm_def:
2066 ; GFX10: ; %bb.0: ; %bb
2067 ; GFX10-NEXT: s_mov_b32 s0, 0xffff
2068 ; GFX10-NEXT: ;;#ASMSTART
2069 ; GFX10-NEXT: v_and_b32_e32 v0, s0, v0
2070 ; GFX10-NEXT: ;;#ASMEND
2071 ; GFX10-NEXT: v_or_b32_e32 v0, 0x10000, v0
2072 ; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
2073 ; GFX10-NEXT: .LBB21_1: ; %bb1
2074 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2075 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
2076 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2077 ; GFX10-NEXT: s_cbranch_vccnz .LBB21_1
2078 ; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
2079 ; GFX10-NEXT: s_endpgm
2083 bb1: ; preds = %bb11, %bb
2084 %tmp = phi <2 x i32> [ %tmp12, %bb11 ], [ undef, %bb ]
2085 br i1 true, label %bb2, label %bb11
2088 %tmp3 = call i32 asm "v_and_b32_e32 $0, $1, $2", "=v,s,v"(i32 65535, i32 undef) #1
2089 %tmp5 = or i32 %tmp3, 65536
2090 %tmp6 = insertelement <2 x i32> %tmp, i32 %tmp5, i64 0
2093 bb11: ; preds = %bb10, %bb2
2094 %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ]
2095 store volatile <2 x i32> %tmp12, ptr addrspace(1) undef
; Test function @crash_lshlrevb16_not_reg_op. The name suggests it guards against a
; past compiler crash (presumably in the SDWA peephole pass - TODO confirm against history).
; The IR below copies a byte of a 2-byte addrspace(5) array, selected by a loop phi,
; to the address formed from null plus that same index.
; In the expected output the byte comes from v_lshrrev_b16 applied to the constant 0x100
; with a shift amount of (index << 3):
;   - fiji and gfx900 runs keep 0x100 in a VGPR (v_lshrrev_b16_e32 v3, s6, v0);
;   - the gfx1010 run passes 0x100 directly as a literal operand (v_lshrrev_b16 v2, s6, 0x100).
; Note that the emitted opcode is lshrrev although the function name says lshlrev.
; The assertions are autogenerated (see the note on the first line of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
2099 define void @crash_lshlrevb16_not_reg_op() {
2100 ; NOSDWA-LABEL: crash_lshlrevb16_not_reg_op:
2101 ; NOSDWA: ; %bb.0: ; %bb0
2102 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2103 ; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
2104 ; NOSDWA-NEXT: v_mov_b32_e32 v0, 0x100
2105 ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
2106 ; NOSDWA-NEXT: .LBB22_1: ; %bb1
2107 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
2108 ; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
2109 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s4
2110 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s5
2111 ; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
2112 ; NOSDWA-NEXT: v_lshrrev_b16_e32 v3, s6, v0
2113 ; NOSDWA-NEXT: flat_store_byte v[1:2], v3
2114 ; NOSDWA-NEXT: s_mov_b64 vcc, vcc
2115 ; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
2116 ; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock
2117 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2118 ; NOSDWA-NEXT: s_setpc_b64 s[30:31]
2120 ; GFX89-LABEL: crash_lshlrevb16_not_reg_op:
2121 ; GFX89: ; %bb.0: ; %bb0
2122 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2123 ; GFX89-NEXT: s_mov_b64 s[4:5], 0
2124 ; GFX89-NEXT: v_mov_b32_e32 v0, 0x100
2125 ; GFX89-NEXT: s_and_b64 vcc, exec, -1
2126 ; GFX89-NEXT: .LBB22_1: ; %bb1
2127 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
2128 ; GFX89-NEXT: s_lshl_b32 s6, s4, 3
2129 ; GFX89-NEXT: v_mov_b32_e32 v1, s4
2130 ; GFX89-NEXT: v_mov_b32_e32 v2, s5
2131 ; GFX89-NEXT: s_mov_b64 s[4:5], 1
2132 ; GFX89-NEXT: v_lshrrev_b16_e32 v3, s6, v0
2133 ; GFX89-NEXT: flat_store_byte v[1:2], v3
2134 ; GFX89-NEXT: s_mov_b64 vcc, vcc
2135 ; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
2136 ; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock
2137 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2138 ; GFX89-NEXT: s_setpc_b64 s[30:31]
2140 ; GFX9-LABEL: crash_lshlrevb16_not_reg_op:
2141 ; GFX9: ; %bb.0: ; %bb0
2142 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2143 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2144 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x100
2145 ; GFX9-NEXT: s_and_b64 vcc, exec, -1
2146 ; GFX9-NEXT: .LBB22_1: ; %bb1
2147 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2148 ; GFX9-NEXT: s_lshl_b32 s6, s4, 3
2149 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2150 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
2151 ; GFX9-NEXT: s_mov_b64 s[4:5], 1
2152 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, s6, v0
2153 ; GFX9-NEXT: flat_store_byte v[1:2], v3
2154 ; GFX9-NEXT: s_mov_b64 vcc, vcc
2155 ; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
2156 ; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock
2157 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2158 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2160 ; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
2161 ; GFX10: ; %bb.0: ; %bb0
2162 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2163 ; GFX10-NEXT: s_mov_b64 s[4:5], 0
2164 ; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
2165 ; GFX10-NEXT: .LBB22_1: ; %bb1
2166 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2167 ; GFX10-NEXT: s_lshl_b32 s6, s4, 3
2168 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
2169 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
2170 ; GFX10-NEXT: v_lshrrev_b16 v2, s6, 0x100
2171 ; GFX10-NEXT: s_mov_b64 s[4:5], 1
2172 ; GFX10-NEXT: flat_store_byte v[0:1], v2
2173 ; GFX10-NEXT: s_cbranch_vccnz .LBB22_1
2174 ; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
2175 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2176 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Two-byte array in addrspace(5); the store below writes the value 1 into element 1.
2177 %1 = alloca [2 x i8], align 1, addrspace(5)
2178 %2 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1
2182 store i8 1, ptr addrspace(5) %2, align 1
; Loop index: 0 when arriving from %bb0, 1 on every later trip around %bb1.
2186 %3 = phi i64 [ 1, %bb1 ], [ 0, %bb0 ]
2187 %4 = trunc i64 %3 to i32
2188 %5 = getelementptr i8, ptr addrspace(5) %1, i32 %4
2189 %6 = load i8, ptr addrspace(5) %5, align 1
; The loaded byte is written through a pointer computed as null + index.
2190 %7 = getelementptr i8, ptr null, i64 %3
2191 store i8 %6, ptr %7, align 1
; Constant-false condition, so the branch always goes back to %bb1 and never to %bb2.
2192 br i1 false, label %bb2, label %bb1
; Kernel @mac_v2half_same_srcop computes %out = (%inb * %inb) + %ina on <2 x half>,
; i.e. a multiply-add in which both multiplicands are the same value.
; Expected lowering per run (see the RUN lines at the top of the file):
;   - fiji with the SDWA peephole disabled - the high halves are extracted with
;     v_lshrrev_b32 by 16 and each half uses a plain v_mac_f16_e32;
;   - fiji with the SDWA peephole enabled - the high-half multiply-add becomes
;     v_mac_f16_sdwa with src0 and src1 both selecting WORD_1 of the same register;
;   - gfx900 and gfx1010 - a packed v_pk_mul_f16 followed by v_pk_add_f16.
; The assertions are autogenerated; regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
2198 define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
2199 ; NOSDWA-LABEL: mac_v2half_same_srcop:
2200 ; NOSDWA: ; %bb.0: ; %entry
2201 ; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2202 ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2203 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
2204 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
2205 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
2206 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
2207 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
2208 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
2209 ; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
2210 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
2211 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
2212 ; NOSDWA-NEXT: s_waitcnt vmcnt(1)
2213 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
2214 ; NOSDWA-NEXT: s_waitcnt vmcnt(0)
2215 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2216 ; NOSDWA-NEXT: v_mac_f16_e32 v5, v4, v4
2217 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2218 ; NOSDWA-NEXT: v_mac_f16_e32 v3, v2, v2
2219 ; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v4
2220 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2
2221 ; NOSDWA-NEXT: s_endpgm
2223 ; GFX89-LABEL: mac_v2half_same_srcop:
2224 ; GFX89: ; %bb.0: ; %entry
2225 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2226 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2227 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
2228 ; GFX89-NEXT: v_mov_b32_e32 v0, s6
2229 ; GFX89-NEXT: v_mov_b32_e32 v1, s7
2230 ; GFX89-NEXT: v_mov_b32_e32 v2, s0
2231 ; GFX89-NEXT: v_mov_b32_e32 v3, s1
2232 ; GFX89-NEXT: flat_load_dword v4, v[0:1]
2233 ; GFX89-NEXT: flat_load_dword v2, v[2:3]
2234 ; GFX89-NEXT: v_mov_b32_e32 v0, s4
2235 ; GFX89-NEXT: v_mov_b32_e32 v1, s5
2236 ; GFX89-NEXT: s_waitcnt vmcnt(1)
2237 ; GFX89-NEXT: v_lshrrev_b32_e32 v3, 16, v4
2238 ; GFX89-NEXT: s_waitcnt vmcnt(0)
2239 ; GFX89-NEXT: v_mac_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2240 ; GFX89-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2241 ; GFX89-NEXT: v_mac_f16_e32 v4, v2, v2
2242 ; GFX89-NEXT: v_or_b32_e32 v2, v4, v3
2243 ; GFX89-NEXT: flat_store_dword v[0:1], v2
2244 ; GFX89-NEXT: s_endpgm
2246 ; GFX9-LABEL: mac_v2half_same_srcop:
2247 ; GFX9: ; %bb.0: ; %entry
2248 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2249 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2250 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2251 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2252 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
2253 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2254 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2255 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1
2256 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2257 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2
2258 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
2259 ; GFX9-NEXT: s_endpgm
2261 ; GFX10-LABEL: mac_v2half_same_srcop:
2262 ; GFX10: ; %bb.0: ; %entry
2263 ; GFX10-NEXT: s_clause 0x1
2264 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2265 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2266 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2267 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2268 ; GFX10-NEXT: s_clause 0x1
2269 ; GFX10-NEXT: global_load_dword v1, v0, s[0:1]
2270 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
2271 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2272 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1
2273 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2274 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2
2275 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
2276 ; GFX10-NEXT: s_endpgm
; Load both <2 x half> inputs from global memory.
2278 %a = load <2 x half>, ptr addrspace(1) %ina, align 4
2279 %b = load <2 x half>, ptr addrspace(1) %inb, align 4
; Square %b element-wise (same value for both fmul operands), then add %a.
2280 %mul = fmul <2 x half> %b, %b
2281 %mac = fadd <2 x half> %mul, %a
2282 store <2 x half> %mac, ptr addrspace(1) %out, align 4
; Declaration of the llvm.amdgcn.workitem.id.x intrinsic (no arguments, returns i32).
2286 declare i32 @llvm.amdgcn.workitem.id.x()
; Attribute group #0, attached to kernels in this file such as @add_shr_i32 and
; @mac_v2half_same_srcop; it sets "denormal-fp-math" to "preserve-sign,preserve-sign".
2288 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
2289 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: