1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
; shuffle6766: loads two <4 x i8> vectors from addrspace(1), shuffles them with
; mask <6, 7, 6, 6> and stores the result to %out0. Mask indices 4-7 select from
; the second operand, so every result byte comes from %vec1 (elements 2, 3, 2, 2).
; The checks expect a single global_load_dword and one v_perm_b32 with selector
; 0x6060706 (a literal operand on GFX10, moved into s4 first on GFX9).
5 define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
6 ; GFX10-LABEL: shuffle6766:
8 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
10 ; GFX10-NEXT: s_waitcnt vmcnt(0)
11 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706
12 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
13 ; GFX10-NEXT: s_setpc_b64 s[30:31]
15 ; GFX9-LABEL: shuffle6766:
17 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
19 ; GFX9-NEXT: s_mov_b32 s4, 0x6060706
20 ; GFX9-NEXT: s_waitcnt vmcnt(0)
21 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
22 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
23 ; GFX9-NEXT: s_waitcnt vmcnt(0)
24 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
26 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
27 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6>
28 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle3744: addrspace(1) loads of %vec0/%vec1, shuffle with mask <3, 7, 4, 4>,
; store to %out0. Result elements are %vec0[3], %vec1[3], %vec1[0], %vec1[0], so
; both inputs are needed. The checks expect two global_load_dword and one
; v_perm_b32 with selector 0x307; on GFX9 the selector is loaded into s4 with
; s_movk_i32 rather than s_mov_b32.
32 define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
33 ; GFX10-LABEL: shuffle3744:
35 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
37 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
38 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307
40 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
41 ; GFX10-NEXT: s_setpc_b64 s[30:31]
43 ; GFX9-LABEL: shuffle3744:
45 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
47 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
48 ; GFX9-NEXT: s_movk_i32 s4, 0x307
49 ; GFX9-NEXT: s_waitcnt vmcnt(0)
50 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
51 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
53 ; GFX9-NEXT: s_setpc_b64 s[30:31]
54 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
55 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
56 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4>
57 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle4445: addrspace(1) loads, shuffle with mask <4, 4, 4, 5>, store to %out0.
; All mask indices are >= 4, so the result is built only from %vec1
; (elements 0, 0, 0, 1). The checks expect a single global_load_dword and one
; v_perm_b32 with selector 0x5040404.
61 define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
62 ; GFX10-LABEL: shuffle4445:
64 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
66 ; GFX10-NEXT: s_waitcnt vmcnt(0)
67 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404
68 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
69 ; GFX10-NEXT: s_setpc_b64 s[30:31]
71 ; GFX9-LABEL: shuffle4445:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
75 ; GFX9-NEXT: s_mov_b32 s4, 0x5040404
76 ; GFX9-NEXT: s_waitcnt vmcnt(0)
77 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
78 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
79 ; GFX9-NEXT: s_waitcnt vmcnt(0)
80 ; GFX9-NEXT: s_setpc_b64 s[30:31]
81 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
82 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
83 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5>
84 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle0101: addrspace(1) loads, shuffle with mask <0, 1, 0, 1>, store to %out0.
; All mask indices are < 4, so the result is built only from %vec0
; (elements 0, 1, 0, 1). The checks expect a single global_load_dword and one
; v_perm_b32 with selector 0x5040504.
88 define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
89 ; GFX10-LABEL: shuffle0101:
91 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
93 ; GFX10-NEXT: s_waitcnt vmcnt(0)
94 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
95 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
96 ; GFX10-NEXT: s_setpc_b64 s[30:31]
98 ; GFX9-LABEL: shuffle0101:
100 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
102 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
104 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
105 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
108 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
109 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
110 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
111 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle1004: addrspace(1) loads, shuffle with mask <1, 0, 0, 4>, store to %out0.
; Result elements are %vec0[1], %vec0[0], %vec0[0], %vec1[0]. The checks expect
; two global_load_dword and one v_perm_b32 with selector 0x40405.
115 define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
116 ; GFX10-LABEL: shuffle1004:
118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
120 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
121 ; GFX10-NEXT: s_waitcnt vmcnt(0)
122 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405
123 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
124 ; GFX10-NEXT: s_setpc_b64 s[30:31]
126 ; GFX9-LABEL: shuffle1004:
128 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
130 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
131 ; GFX9-NEXT: s_mov_b32 s4, 0x40405
132 ; GFX9-NEXT: s_waitcnt vmcnt(0)
133 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
134 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: s_setpc_b64 s[30:31]
137 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
138 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
139 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4>
140 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle7533: same load/shuffle/store shape, but through addrspace(0) pointers;
; the checks expect flat_load_dword / flat_store_dword. Mask <7, 5, 3, 3> yields
; %vec1[3], %vec1[1], %vec0[3], %vec0[3]. One v_perm_b32 with selector 0x3030507
; is expected, with the second-loaded register (v7) as its first source operand.
146 define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
147 ; GFX10-LABEL: shuffle7533:
149 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150 ; GFX10-NEXT: flat_load_dword v6, v[0:1]
151 ; GFX10-NEXT: flat_load_dword v7, v[2:3]
152 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
153 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507
154 ; GFX10-NEXT: flat_store_dword v[4:5], v0
155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX10-NEXT: s_setpc_b64 s[30:31]
158 ; GFX9-LABEL: shuffle7533:
160 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX9-NEXT: flat_load_dword v6, v[0:1]
162 ; GFX9-NEXT: flat_load_dword v7, v[2:3]
163 ; GFX9-NEXT: s_mov_b32 s4, 0x3030507
164 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
165 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
166 ; GFX9-NEXT: flat_store_dword v[4:5], v0
167 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
168 ; GFX9-NEXT: s_setpc_b64 s[30:31]
169 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
170 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
171 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
172 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
; shuffle7767: addrspace(0) variant with mask <7, 7, 6, 7>. All indices are >= 4,
; so only %vec1 contributes (elements 3, 3, 2, 3). The checks expect a single
; flat_load_dword and one v_perm_b32 with selector 0x7060707.
176 define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
177 ; GFX10-LABEL: shuffle7767:
179 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; GFX10-NEXT: flat_load_dword v0, v[2:3]
181 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
182 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707
183 ; GFX10-NEXT: flat_store_dword v[4:5], v0
184 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX10-NEXT: s_setpc_b64 s[30:31]
187 ; GFX9-LABEL: shuffle7767:
189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; GFX9-NEXT: flat_load_dword v0, v[2:3]
191 ; GFX9-NEXT: s_mov_b32 s4, 0x7060707
192 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
193 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
194 ; GFX9-NEXT: flat_store_dword v[4:5], v0
195 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
196 ; GFX9-NEXT: s_setpc_b64 s[30:31]
197 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
198 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
199 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
200 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
; shuffle0554: addrspace(3) variant; the checks expect ds_read_b32 / ds_write_b32.
; Mask <0, 5, 5, 4> yields %vec0[0], %vec1[1], %vec1[1], %vec1[0]. One v_perm_b32
; with selector 0x10104 is expected.
204 define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
205 ; GFX10-LABEL: shuffle0554:
207 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208 ; GFX10-NEXT: ds_read_b32 v0, v0
209 ; GFX10-NEXT: ds_read_b32 v1, v1
210 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104
212 ; GFX10-NEXT: ds_write_b32 v2, v0
213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX10-NEXT: s_setpc_b64 s[30:31]
216 ; GFX9-LABEL: shuffle0554:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX9-NEXT: ds_read_b32 v0, v0
220 ; GFX9-NEXT: ds_read_b32 v1, v1
221 ; GFX9-NEXT: s_mov_b32 s4, 0x10104
222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
224 ; GFX9-NEXT: ds_write_b32 v2, v0
225 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
226 ; GFX9-NEXT: s_setpc_b64 s[30:31]
227 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
228 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
229 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
230 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
; shuffle2127: addrspace(3) variant with mask <2, 1, 2, 7>, i.e. %vec0[2],
; %vec0[1], %vec0[2], %vec1[3]. The checks expect two ds_read_b32, one v_perm_b32
; with selector 0x3060506, and a ds_write_b32.
234 define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
235 ; GFX10-LABEL: shuffle2127:
237 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX10-NEXT: ds_read_b32 v0, v0
239 ; GFX10-NEXT: ds_read_b32 v1, v1
240 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506
242 ; GFX10-NEXT: ds_write_b32 v2, v0
243 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX10-NEXT: s_setpc_b64 s[30:31]
246 ; GFX9-LABEL: shuffle2127:
248 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249 ; GFX9-NEXT: ds_read_b32 v0, v0
250 ; GFX9-NEXT: ds_read_b32 v1, v1
251 ; GFX9-NEXT: s_mov_b32 s4, 0x3060506
252 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
254 ; GFX9-NEXT: ds_write_b32 v2, v0
255 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
256 ; GFX9-NEXT: s_setpc_b64 s[30:31]
257 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
258 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
259 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7>
260 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
; shuffle5047: addrspace(5) variant; the checks expect buffer_load_dword /
; buffer_store_dword with `offen` addressing through s[0:3] (GFX10 additionally
; groups the two loads under s_clause 0x1). Mask <5, 0, 4, 7> yields %vec1[1],
; %vec0[0], %vec1[0], %vec1[3]. One v_perm_b32 with selector 0x7040005 is expected.
264 define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) {
265 ; GFX10-LABEL: shuffle5047:
267 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268 ; GFX10-NEXT: s_clause 0x1
269 ; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
270 ; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
271 ; GFX10-NEXT: s_waitcnt vmcnt(0)
272 ; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005
273 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
274 ; GFX10-NEXT: s_setpc_b64 s[30:31]
276 ; GFX9-LABEL: shuffle5047:
278 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279 ; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
280 ; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
281 ; GFX9-NEXT: s_mov_b32 s4, 0x7040005
282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
283 ; GFX9-NEXT: v_perm_b32 v0, v4, v3, s4
284 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_setpc_b64 s[30:31]
287 %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4
288 %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4
289 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
290 store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4
; shuffle3546: addrspace(1) loads, shuffle with mask <3, 5, 4, 6>, store to %out0.
; Result elements are %vec0[3], %vec1[1], %vec1[0], %vec1[2]. The checks expect
; two global_load_dword and one v_perm_b32 with selector 0x2000107.
294 define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
295 ; GFX10-LABEL: shuffle3546:
297 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
299 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
300 ; GFX10-NEXT: s_waitcnt vmcnt(0)
301 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107
302 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
303 ; GFX10-NEXT: s_setpc_b64 s[30:31]
305 ; GFX9-LABEL: shuffle3546:
307 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
309 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
310 ; GFX9-NEXT: s_mov_b32 s4, 0x2000107
311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
312 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
313 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
316 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
317 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
318 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6>
319 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle7330ud2: single-input variant; the second shuffle operand is undef.
; Mask <7, 3, 3, 0>: element 0 selects from the undef operand (index 7), the
; remaining elements are %vec0[3], %vec0[3], %vec0[0]. The checks expect one
; global_load_dword and one v_perm_b32 with selector 0x4070706.
324 define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
325 ; GFX10-LABEL: shuffle7330ud2:
327 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
329 ; GFX10-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706
331 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
332 ; GFX10-NEXT: s_setpc_b64 s[30:31]
334 ; GFX9-LABEL: shuffle7330ud2:
336 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
338 ; GFX9-NEXT: s_mov_b32 s4, 0x4070706
339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
340 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
341 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
342 ; GFX9-NEXT: s_waitcnt vmcnt(0)
343 ; GFX9-NEXT: s_setpc_b64 s[30:31]
344 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
345 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
346 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle5341ud2: single-input variant (second operand undef) with mask
; <5, 3, 4, 1>. Elements 0 and 2 select from the undef operand; elements 1 and 3
; are %vec0[3] and %vec0[1]. Unlike the neighbouring tests, the checks expect
; v_alignbit_b32 v0, v0, v0, 16 rather than a v_perm_b32, on both targets.
350 define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
351 ; GFX10-LABEL: shuffle5341ud2:
353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
355 ; GFX10-NEXT: s_waitcnt vmcnt(0)
356 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
357 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
358 ; GFX10-NEXT: s_setpc_b64 s[30:31]
360 ; GFX9-LABEL: shuffle5341ud2:
362 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
365 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
366 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
367 ; GFX9-NEXT: s_waitcnt vmcnt(0)
368 ; GFX9-NEXT: s_setpc_b64 s[30:31]
369 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
370 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1>
371 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle6106ud2: single-input variant (second operand undef) with mask
; <6, 1, 0, 6>. Elements 0 and 3 select from the undef operand; elements 1 and 2
; are %vec0[1] and %vec0[0]. The checks expect one v_perm_b32 with selector
; 0x5040504.
375 define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
376 ; GFX10-LABEL: shuffle6106ud2:
378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
381 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
382 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
383 ; GFX10-NEXT: s_setpc_b64 s[30:31]
385 ; GFX9-LABEL: shuffle6106ud2:
387 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
389 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
391 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
392 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
393 ; GFX9-NEXT: s_waitcnt vmcnt(0)
394 ; GFX9-NEXT: s_setpc_b64 s[30:31]
395 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
396 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
397 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle4327ud2: single-input variant (second operand undef) with mask
; <4, 3, 2, 7>. Elements 0 and 3 select from the undef operand; elements 1 and 2
; are %vec0[3] and %vec0[2]. The checks expect one v_perm_b32 with selector
; 0x7060706.
402 define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
403 ; GFX10-LABEL: shuffle4327ud2:
405 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
407 ; GFX10-NEXT: s_waitcnt vmcnt(0)
408 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
409 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
410 ; GFX10-NEXT: s_setpc_b64 s[30:31]
412 ; GFX9-LABEL: shuffle4327ud2:
414 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
416 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
417 ; GFX9-NEXT: s_waitcnt vmcnt(0)
418 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
419 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
421 ; GFX9-NEXT: s_setpc_b64 s[30:31]
422 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7>
424 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle3263ud2: single-input variant (second operand undef) with mask
; <3, 2, 6, 3>. Element 2 selects from the undef operand; the others are
; %vec0[3], %vec0[2] and %vec0[3]. The checks expect one v_perm_b32 with selector
; 0x7060607.
428 define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
429 ; GFX10-LABEL: shuffle3263ud2:
431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
433 ; GFX10-NEXT: s_waitcnt vmcnt(0)
434 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607
435 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
436 ; GFX10-NEXT: s_setpc_b64 s[30:31]
438 ; GFX9-LABEL: shuffle3263ud2:
440 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
442 ; GFX9-NEXT: s_mov_b32 s4, 0x7060607
443 ; GFX9-NEXT: s_waitcnt vmcnt(0)
444 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
445 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
447 ; GFX9-NEXT: s_setpc_b64 s[30:31]
448 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
449 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3>
450 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle2763ud2: single-input variant (second operand undef) with mask
; <2, 7, 6, 3>. Elements 1 and 2 select from the undef operand; elements 0 and 3
; are %vec0[2] and %vec0[3]. The checks expect one v_perm_b32 with selector
; 0x7060706.
454 define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
455 ; GFX10-LABEL: shuffle2763ud2:
457 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
461 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
462 ; GFX10-NEXT: s_setpc_b64 s[30:31]
464 ; GFX9-LABEL: shuffle2763ud2:
466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
468 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
471 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
473 ; GFX9-NEXT: s_setpc_b64 s[30:31]
474 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
475 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3>
476 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle1327ud2: single-input variant (second operand undef) with mask
; <1, 3, 2, 7>. Element 3 selects from the undef operand; the others are
; %vec0[1], %vec0[3] and %vec0[2]. The checks expect one v_perm_b32 with selector
; 0x7060705.
480 define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
481 ; GFX10-LABEL: shuffle1327ud2:
483 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705
487 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
488 ; GFX10-NEXT: s_setpc_b64 s[30:31]
490 ; GFX9-LABEL: shuffle1327ud2:
492 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
494 ; GFX9-NEXT: s_mov_b32 s4, 0x7060705
495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
496 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
497 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
499 ; GFX9-NEXT: s_setpc_b64 s[30:31]
500 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
501 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7>
502 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; shuffle0605ud2: single-input variant (second operand undef) with mask
; <0, 6, 0, 5>. Elements 1 and 3 select from the undef operand; elements 0 and 2
; are both %vec0[0]. The checks expect one v_perm_b32 with selector 0x5040504.
506 define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
507 ; GFX10-LABEL: shuffle0605ud2:
509 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
511 ; GFX10-NEXT: s_waitcnt vmcnt(0)
512 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
513 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
514 ; GFX10-NEXT: s_setpc_b64 s[30:31]
516 ; GFX9-LABEL: shuffle0605ud2:
518 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
520 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
521 ; GFX9-NEXT: s_waitcnt vmcnt(0)
522 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
523 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
524 ; GFX9-NEXT: s_waitcnt vmcnt(0)
525 ; GFX9-NEXT: s_setpc_b64 s[30:31]
526 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
527 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
528 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; insertUsesOr: shuffles %vec0 against undef with mask <1, 3, 5, 4> (elements 2
; and 3 select from the undef operand), then overwrites element 1 with %elt via
; insertelement before storing to %out0 (the store carries no explicit align).
; %vec1 is loaded but has no user in this body, and the checks expect only one
; global_load_dword. No v_perm_b32 is expected here: both targets check for
; v_lshlrev_b16 + v_or_b32_sdwa + v_and_b32 with 0xffff instead.
532 define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
533 ; GFX10-LABEL: insertUsesOr:
535 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
537 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
538 ; GFX10-NEXT: s_waitcnt vmcnt(0)
539 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
540 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
541 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
542 ; GFX10-NEXT: s_setpc_b64 s[30:31]
544 ; GFX9-LABEL: insertUsesOr:
546 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
548 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v4
549 ; GFX9-NEXT: s_waitcnt vmcnt(0)
550 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
551 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
552 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
554 ; GFX9-NEXT: s_setpc_b64 s[30:31]
555 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
556 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
557 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
558 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
559 store <4 x i8> %vecins, ptr addrspace(1) %out0
; addUsesOr: shuffles %vec0 against undef with mask <7, 0, 6, 3> (elements 0 and
; 2 select from the undef operand; elements 1 and 3 are %vec0[0] and %vec0[3]),
; adds the result element-wise to %vec1 and stores the sum to %out0. %elt is not
; used in this body. No v_perm_b32 is expected: GFX10 checks for explicit
; shift / v_add_nc_u16 / v_or_b32_sdwa sequences, GFX9 for v_add_u16_sdwa plus
; v_or_b32_sdwa.
563 define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
564 ; GFX10-LABEL: addUsesOr:
566 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
568 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
569 ; GFX10-NEXT: s_waitcnt vmcnt(1)
570 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
571 ; GFX10-NEXT: s_waitcnt vmcnt(0)
572 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v7
573 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
574 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7
575 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
576 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
577 ; GFX10-NEXT: v_add_nc_u16 v2, v2, v3
578 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
579 ; GFX10-NEXT: v_add_nc_u16 v1, v4, v1
580 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
581 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
582 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
583 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
584 ; GFX10-NEXT: s_setpc_b64 s[30:31]
586 ; GFX9-LABEL: addUsesOr:
588 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
590 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
592 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
593 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
594 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
595 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1
596 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
597 ; GFX9-NEXT: s_waitcnt vmcnt(0)
598 ; GFX9-NEXT: s_setpc_b64 s[30:31]
599 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
600 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
601 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3>
602 %added = add <4 x i8> %shuffle0_0, %vec1
603 store <4 x i8> %added, ptr addrspace(1) %out0
; shuffle8i8: amdgpu_kernel (not a `hidden` callable like the tests above) that
; shuffles two <8 x i8> vectors with mask <1, 8, 5, 12, 0, 14, 2, 9>, i.e.
; %vec0[1], %vec1[0], %vec0[5], %vec1[4], %vec0[0], %vec1[6], %vec0[2], %vec1[1],
; and stores the 8-byte result to %out1. Loads and store carry no explicit align.
; The checks expect the inputs to arrive via s_load_dwordx2 and the result to be
; assembled with v_lshlrev_b16 / v_and_b32 / v_or_b32_sdwa; no v_perm_b32 is
; expected. Attribute group #0 is defined outside this view.
608 define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 {
609 ; GFX10-LABEL: shuffle8i8:
610 ; GFX10: ; %bb.0: ; %bb
611 ; GFX10-NEXT: s_clause 0x1
612 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
613 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
614 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
615 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
616 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
617 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
618 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
619 ; GFX10-NEXT: s_lshr_b32 s1, s1, 8
620 ; GFX10-NEXT: s_lshr_b32 s4, s9, 16
621 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9
622 ; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8
623 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4
624 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8
625 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
626 ; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
627 ; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
628 ; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
629 ; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
630 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
631 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
632 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
633 ; GFX10-NEXT: s_endpgm
635 ; GFX9-LABEL: shuffle8i8:
636 ; GFX9: ; %bb.0: ; %bb
637 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
638 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
639 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00
640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
641 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
643 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
644 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
645 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8
646 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9
647 ; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
648 ; GFX9-NEXT: s_lshr_b32 s1, s9, 16
649 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
650 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8
651 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0
652 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1
653 ; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
654 ; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
655 ; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
656 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
657 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
658 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
659 ; GFX9-NEXT: s_endpgm
661 %vec0 = load <8 x i8>, ptr addrspace(1) %in0
662 %vec1 = load <8 x i8>, ptr addrspace(1) %in1
663 %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9>
664 store <8 x i8> %shuffle0, ptr addrspace(1) %out1
; Work-item id intrinsics. Among the functions visible in this part of the file,
; only @llvm.amdgcn.workitem.id.x is called (by @add_div, to index %in0/%in1 per
; work-item); any use of the .y variant lies outside this view.
668 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
669 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
671 ; Not combined to perm due to non-vectorized use, non-divergent
; add: shuffles %vec0 and %vec1 with mask <1, 3, 5, 4> (%vec0[1], %vec0[3],
; %vec1[1], %vec1[0]), adds the result element-wise to %vec1 and stores the sum
; to %out0. %elt is not used in this body. Neither check block contains a
; v_perm_b32; the bytes are extracted and recombined with shifts, 16-bit adds and
; v_or_b32_sdwa.
; NOTE(review): unlike every other function in this listing, no GFX10-LABEL /
; GFX9-LABEL line is visible for @add -- confirm the check blocks are anchored.
672 define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
675 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
677 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
678 ; GFX10-NEXT: s_waitcnt vmcnt(1)
679 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
680 ; GFX10-NEXT: s_waitcnt vmcnt(0)
681 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
682 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v7
683 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4
684 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v7
685 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
686 ; GFX10-NEXT: v_add_nc_u16 v2, v7, v2
687 ; GFX10-NEXT: v_add_nc_u16 v3, v3, v7
688 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v4
689 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
690 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
691 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
692 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
693 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
694 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
695 ; GFX10-NEXT: s_setpc_b64 s[30:31]
699 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
700 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
701 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
702 ; GFX9-NEXT: s_waitcnt vmcnt(1)
703 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
704 ; GFX9-NEXT: s_waitcnt vmcnt(0)
705 ; GFX9-NEXT: v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
706 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
707 ; GFX9-NEXT: v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
708 ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
709 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
710 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
711 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
712 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
713 ; GFX9-NEXT: s_waitcnt vmcnt(0)
714 ; GFX9-NEXT: s_setpc_b64 s[30:31]
715 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
716 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
717 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
718 %vecins = add <4 x i8> %shuffle0_0, %vec1
719 store <4 x i8> %vecins, ptr addrspace(1) %out0
723 ; Not combined to perm due to non-vectorized use
; add_div: like the shuffle+add tests, but both load addresses are offset by the
; work-item id (getelementptr <4 x i8> by %tid), so they differ per work-item.
; The checks expect the id to be taken from v31 masked with 0x3ff and scaled by 4
; (shift left by 2) before being added to each 64-bit base. The shuffle uses
; %vec0 against undef with mask <1, 3, 5, 4> (elements 2 and 3 select from the
; undef operand); the result is added to %vec1 and stored to %out0. %elt is not
; used in this body. No v_perm_b32 is expected on either target.
724 define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
725 ; GFX10-LABEL: add_div:
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
729 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
730 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
731 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
732 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
733 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
734 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
735 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
736 ; GFX10-NEXT: s_waitcnt vmcnt(1)
737 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
738 ; GFX10-NEXT: s_waitcnt vmcnt(0)
739 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
740 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
741 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4
742 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
743 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v7
744 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
745 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
746 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
747 ; GFX10-NEXT: s_setpc_b64 s[30:31]
749 ; GFX9-LABEL: add_div:
751 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
753 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
754 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
755 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
756 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
757 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
758 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
759 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
760 ; GFX9-NEXT: s_waitcnt vmcnt(0)
761 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
762 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
763 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
764 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
765 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
767 ; GFX9-NEXT: s_setpc_b64 s[30:31]
768 %tid = call i32 @llvm.amdgcn.workitem.id.x()
769 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
770 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
771 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
772 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
773 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
774 %vecins = add <4 x i8> %shuffle0_0, %vec1
775 store <4 x i8> %vecins, ptr addrspace(1) %out0
779 ; Not combined to perm due to non-divergent use
; add_store: loads <4 x i8> directly from %in0 and %in1 (no workitem-id
; indexing), shuffles %vec0 against undef, stores shuffle + %vec1 to %out0 and
; the shuffle itself to %out1. %elt is unused. The expected output for both run
; lines contains no v_perm_b32.
780 define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
781 ; GFX10-LABEL: add_store:
783 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
784 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
785 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
786 ; GFX10-NEXT: s_waitcnt vmcnt(1)
787 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
788 ; GFX10-NEXT: s_waitcnt vmcnt(0)
789 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
790 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
791 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
792 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
793 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
794 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
795 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
796 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
797 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
798 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
799 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
800 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
801 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
802 ; GFX10-NEXT: s_setpc_b64 s[30:31]
804 ; GFX9-LABEL: add_store:
806 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
808 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
809 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
810 ; GFX9-NEXT: s_waitcnt vmcnt(1)
811 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
812 ; GFX9-NEXT: v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
813 ; GFX9-NEXT: s_waitcnt vmcnt(0)
814 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
815 ; GFX9-NEXT: v_or_b32_e32 v1, v0, v1
816 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v9
817 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
818 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
819 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
820 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
821 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
823 ; GFX9-NEXT: s_setpc_b64 s[30:31]
824 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
825 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; Mask indices 5 and 4 pick from the undef operand, so only result lanes 0 and 1
; (elements 1 and 3 of %vec0) carry defined values.
826 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
827 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
828 store <4 x i8> %vecins, ptr addrspace(1) %out0
829 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
833 ; Not combined to perm due to 16 bit or
; add_store_div_16: same IR body as add_store, but the two loads are indexed by
; the workitem id. The shuffle's second operand is undef, so only two result
; bytes are defined; the expected output masks both stored dwords with 0xffff
; and contains no v_perm_b32. %elt is unused.
834 define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
835 ; GFX10-LABEL: add_store_div_16:
837 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
838 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
839 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
840 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
841 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
842 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
843 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
844 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
845 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
846 ; GFX10-NEXT: s_waitcnt vmcnt(1)
847 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
848 ; GFX10-NEXT: s_waitcnt vmcnt(0)
849 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
850 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
851 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
852 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
853 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
854 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
855 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
856 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
857 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
858 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
859 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
860 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
861 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
862 ; GFX10-NEXT: s_setpc_b64 s[30:31]
864 ; GFX9-LABEL: add_store_div_16:
866 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
868 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
869 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
870 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
871 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
872 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
873 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
874 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
875 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
876 ; GFX9-NEXT: s_waitcnt vmcnt(1)
877 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9
878 ; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
879 ; GFX9-NEXT: v_or_b32_e32 v2, v1, v2
880 ; GFX9-NEXT: s_waitcnt vmcnt(0)
881 ; GFX9-NEXT: v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
882 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
883 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
884 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
885 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
886 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
887 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
888 ; GFX9-NEXT: s_waitcnt vmcnt(0)
889 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
890 %tid = call i32 @llvm.amdgcn.workitem.id.x()
891 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
892 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
893 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
894 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 5 and 4 pick from the undef operand, so only result lanes 0 and 1
; (elements 1 and 3 of %vec0) carry defined values.
895 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
896 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
897 store <4 x i8> %vecins, ptr addrspace(1) %out0
898 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
902 ; Vectorized use, divergent, 32 bit or
; add_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors (all four result bytes are defined), stores
; shuffle + %vec1 to %out0 and the shuffle itself to %out1. %elt is unused.
; The expected output for both run lines contains one v_perm_b32 with selector
; 0x10705; the add is still expanded into 16-bit adds combined with SDWA ors.
903 define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
904 ; GFX10-LABEL: add_store_div:
906 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
908 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
909 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
910 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
911 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
912 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
913 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
914 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
915 ; GFX10-NEXT: s_waitcnt vmcnt(1)
916 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
917 ; GFX10-NEXT: s_waitcnt vmcnt(0)
918 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
919 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v9
920 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4
921 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v9
922 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
923 ; GFX10-NEXT: v_add_nc_u16 v2, v9, v2
924 ; GFX10-NEXT: v_add_nc_u16 v3, v3, v9
925 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v10
926 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
927 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
928 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
929 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
930 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
931 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705
932 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
933 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
934 ; GFX10-NEXT: s_setpc_b64 s[30:31]
936 ; GFX9-LABEL: add_store_div:
938 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
939 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
940 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
941 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
942 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
943 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
944 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
945 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
946 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
947 ; GFX9-NEXT: s_mov_b32 s4, 0x10705
948 ; GFX9-NEXT: s_waitcnt vmcnt(1)
949 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
950 ; GFX9-NEXT: s_waitcnt vmcnt(0)
951 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
952 ; GFX9-NEXT: v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
953 ; GFX9-NEXT: v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
954 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
955 ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
956 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
957 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
958 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
959 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
960 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
961 ; GFX9-NEXT: s_waitcnt vmcnt(0)
962 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
963 %tid = call i32 @llvm.amdgcn.workitem.id.x()
964 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
965 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
966 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
967 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[1], %vec0[3], %vec1[1], %vec1[0].
968 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
969 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
970 store <4 x i8> %vecins, ptr addrspace(1) %out0
971 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; and_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, stores the shuffle ANDed with the constant
; <1, 2, 2, 1> to %out0 and the shuffle itself to %out1. %elt is unused.
; The expected output for both run lines contains one v_perm_b32 with selector
; 0x5070006 next to an and/or sequence for the masked value.
975 define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
976 ; GFX10-LABEL: and_store_div:
978 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
979 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
980 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
981 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
982 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
983 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
984 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
985 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
986 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
987 ; GFX10-NEXT: v_mov_b32_e32 v0, 2
988 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
989 ; GFX10-NEXT: s_waitcnt vmcnt(1)
990 ; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
991 ; GFX10-NEXT: s_waitcnt vmcnt(0)
992 ; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
993 ; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9
994 ; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
995 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
996 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
997 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
998 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006
999 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1000 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1001 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1003 ; GFX9-LABEL: and_store_div:
1005 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1007 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1008 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1009 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1010 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1011 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1012 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1013 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1014 ; GFX9-NEXT: s_mov_b32 s4, 0x5070006
1015 ; GFX9-NEXT: v_mov_b32_e32 v0, 2
1016 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
1017 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1018 ; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1019 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1020 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
1021 ; GFX9-NEXT: v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1022 ; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4
1023 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1024 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1025 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1026 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1027 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1028 ; GFX9-NEXT: global_store_dword v[7:8], v2, off
1029 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1030 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1031 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1032 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1033 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1034 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1035 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[2], %vec1[0], %vec0[3], %vec0[1].
1036 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1>
1037 %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
; The shuffle has two users: the and above and the direct store to %out1.
1038 store <4 x i8> %vecins, ptr addrspace(1) %out0
1039 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; ashr_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, stores the shuffle arithmetically shifted
; right by the per-lane amounts <1, 2, 2, 1> to %out0 and the shuffle itself to
; %out1. %elt is unused. The expected output for both run lines contains one
; v_perm_b32 with selector 0x4010707 next to a shift/or sequence.
1043 define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1044 ; GFX10-LABEL: ashr_store_div:
1046 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1049 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1050 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1051 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1052 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4
1053 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1054 ; GFX10-NEXT: v_mov_b32_e32 v2, 26
1055 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1056 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1057 ; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8
1058 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9
1059 ; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1060 ; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1
1061 ; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
1062 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1063 ; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0
1064 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707
1065 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1066 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1067 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1068 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1069 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
1070 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
1071 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1073 ; GFX9-LABEL: ashr_store_div:
1075 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1077 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1078 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1079 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1080 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1081 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1082 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1083 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1084 ; GFX9-NEXT: v_mov_b32_e32 v0, 26
1085 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
1086 ; GFX9-NEXT: v_mov_b32_e32 v2, 7
1087 ; GFX9-NEXT: s_mov_b32 s4, 0x4010707
1088 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1089 ; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1090 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1091 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1092 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4
1094 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9
1095 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1096 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v2
1097 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1098 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1099 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1100 ; GFX9-NEXT: global_store_dword v[7:8], v3, off
1101 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1102 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1103 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1104 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1105 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1106 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1107 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[3], %vec0[3], %vec1[1], %vec0[0].
1108 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0>
1109 %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
; The shuffle has two users: the ashr above and the direct store to %out1.
1110 store <4 x i8> %vecins, ptr addrspace(1) %out0
1111 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; bc_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, then stores the shuffle twice: bitcast to
; i32 into %out1 and as <4 x i8> into %out0. %elt is unused. The expected
; output for both run lines is a single v_perm_b32 (selector 0x7060104) whose
; result register feeds both stores.
1115 define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1116 ; GFX10-LABEL: bc_store_div:
1118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1119 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1120 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1121 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1122 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1123 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1124 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1125 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1126 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1127 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1128 ; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x7060104
1129 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
1130 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1131 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1133 ; GFX9-LABEL: bc_store_div:
1135 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1137 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1138 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1139 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1140 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1141 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1142 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1143 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1144 ; GFX9-NEXT: s_mov_b32 s4, 0x7060104
1145 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1146 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1147 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1148 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1149 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1150 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1151 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1152 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1153 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1154 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1155 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[0], %vec0[1], %vec1[2], %vec1[3].
1156 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
; The bitcast only reinterprets the four bytes as one i32.
1157 %insvec = bitcast <4 x i8> %shuffle0_0 to i32
; Note the order: the i32 view goes to %out1 first, then the vector to %out0.
1158 store i32 %insvec, ptr addrspace(1) %out1
1159 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; eve_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, stores element 1 of the shuffle as a single
; byte to %out2 and the whole shuffle to %out1. %elt and %out0 are unused.
; The expected output for both run lines contains one v_perm_b32 with selector
; 0x1020305 and a 24-bit right shift of the first loaded dword for the byte
; store.
1164 define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
1165 ; GFX10-LABEL: eve_store_div:
1167 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1168 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1169 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1170 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1171 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1172 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1173 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1174 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1175 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1176 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1177 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
1178 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1179 ; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x1020305
1180 ; GFX10-NEXT: global_store_byte v[9:10], v0, off
1181 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1182 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1184 ; GFX9-LABEL: eve_store_div:
1186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1188 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1189 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1190 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1191 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1192 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1193 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1194 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1195 ; GFX9-NEXT: s_mov_b32 s4, 0x1020305
1196 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1197 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4
1198 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1199 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1200 ; GFX9-NEXT: global_store_byte v[9:10], v1, off
1201 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1202 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1203 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1204 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1205 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1206 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1207 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1208 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[1], %vec0[3], %vec0[2], %vec0[1].
1209 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1>
; Lane 1 of the shuffle is %vec0[3], i.e. the extract reads through the shuffle.
1210 %tmp = extractelement <4 x i8> %shuffle0_0, i32 1
1211 store i8 %tmp, ptr addrspace(1) %out2
1212 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1216 ; Not combined to perm due to multi use of or operands (introduced by insert op)
; ive_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, stores the shuffle with lane 1 replaced by
; %elt to %out0 and the unmodified shuffle to %out1. This is the only function
; in this group that reads %elt. The expected output for both run lines still
; contains one v_perm_b32 (selector 0x2000706) in addition to an and/shift/or
; sequence; presumably the perm serves the plain shuffle store and the or
; sequence the inserted value -- confirm against the register assignment.
1217 define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1218 ; GFX10-LABEL: ive_store_div:
1220 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1221 ; GFX10-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1222 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 2, v9
1223 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v9
1224 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1225 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9
1226 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1227 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1228 ; GFX10-NEXT: global_load_dword v10, v[2:3], off
1229 ; GFX10-NEXT: v_mov_b32_e32 v0, 16
1230 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
1231 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4
1232 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1233 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1234 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1235 ; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1236 ; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1237 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
1238 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1239 ; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
1240 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1241 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1242 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1244 ; GFX9-LABEL: ive_store_div:
1246 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1247 ; GFX9-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1248 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 2, v9
1249 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v9
1250 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1251 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9
1252 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1253 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1254 ; GFX9-NEXT: global_load_dword v10, v[2:3], off
1255 ; GFX9-NEXT: s_movk_i32 s4, 0xff
1256 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
1257 ; GFX9-NEXT: s_mov_b32 s5, 0x2000706
1258 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1259 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1261 ; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1262 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1263 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
1264 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1265 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1266 ; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5
1267 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1268 ; GFX9-NEXT: global_store_dword v[7:8], v3, off
1269 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1270 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1271 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1272 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1273 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1274 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1275 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[2], %vec1[3], %vec0[0], %vec0[2].
1276 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2>
; Overwrites lane 1 (%vec1[3]) with the scalar argument %elt.
1277 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
; The shuffle has two users: the insert above and the direct store to %out1.
1278 store <4 x i8> %vecins, ptr addrspace(1) %out0
1279 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; lhsr_store_div: loads <4 x i8> from %in0 and %in1 indexed by the workitem id,
; shuffles the two loaded vectors, stores the shuffle logically shifted right
; by the per-lane amounts <1, 2, 2, 1> to %out0 and the shuffle itself to
; %out1. %elt is unused. The expected output for both run lines contains one
; v_perm_b32 with selector 0x1030707 next to a shift/or sequence.
; NOTE(review): the name reads "lhsr" while the operation is lshr; this looks
; like a transposition, but the CHECK labels depend on the name, so renaming
; requires regenerating the assertions.
1284 define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1285 ; GFX10-LABEL: lhsr_store_div:
1287 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1288 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1289 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1290 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1291 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1292 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1293 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1294 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1295 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1296 ; GFX10-NEXT: v_mov_b32_e32 v0, 26
1297 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1298 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4
1299 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1300 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1301 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9
1302 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4
1303 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1
1304 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
1305 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1306 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1307 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707
1308 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1309 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1310 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1312 ; GFX9-LABEL: lhsr_store_div:
1314 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1316 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1317 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1318 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1319 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1320 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1321 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1322 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1323 ; GFX9-NEXT: v_mov_b32_e32 v0, 26
1324 ; GFX9-NEXT: s_mov_b32 s4, 0x1030707
1325 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1326 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, 1, v4
1327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1329 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 25, v9
1330 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
1331 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 26, v4
1332 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
1333 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3
1334 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1335 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1336 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1337 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
1338 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1339 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both loads address element %tid of the <4 x i8> arrays at %in0 and %in1.
1340 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1341 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1342 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1343 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1344 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[3], %vec1[3], %vec0[3], %vec0[1].
1345 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
1346 %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
; The shuffle has two users: the lshr above and the direct store to %out1.
1347 store <4 x i8> %vecins, ptr addrspace(1) %out0
1348 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1353 define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1354 ; GFX10-LABEL: mul_store_div:
1356 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1357 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1358 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1359 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1360 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1361 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1362 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1363 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1364 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1365 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1366 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v4
1367 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1368 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v9
1369 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v9
1370 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9
1371 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2
1372 ; GFX10-NEXT: v_mul_lo_u16 v1, v3, v1
1373 ; GFX10-NEXT: v_mul_lo_u16 v2, v4, v9
1374 ; GFX10-NEXT: v_mul_lo_u16 v3, v9, v3
1375 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
1376 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
1377 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1378 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1379 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1380 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2000504
1381 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1382 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1383 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1385 ; GFX9-LABEL: mul_store_div:
1387 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1388 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1389 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1390 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1391 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1392 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1393 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1394 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
1395 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1396 ; GFX9-NEXT: s_mov_b32 s4, 0x2000504
1397 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1398 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1399 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1400 ; GFX9-NEXT: v_mul_lo_u16_e32 v2, v9, v4
1401 ; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1402 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1403 ; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1404 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v4, v0
1405 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1406 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1407 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
1408 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1409 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
1410 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1411 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec0[0], vec0[1], vec1[0], vec1[2]> (mask indices >= 4 select from %vec1).
; The shuffle is multiplied element-wise by %vec1 and the product is stored to
; %out0; the unmodified shuffle is stored to %out1.
; The assertions above expect the stored shuffle to be produced by a single
; v_perm_b32 with selector 0x2000504 on both targets, with the product built
; from separate 16-bit multiplies.
; %elt is not referenced by the body shown here.
1412 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1413 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1414 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1415 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1416 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1417 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6>
1418 %vecins = mul <4 x i8> %shuffle0_0, %vec1
1419 store <4 x i8> %vecins, ptr addrspace(1) %out0
1420 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1425 define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1426 ; GFX10-LABEL: or_store_div:
1428 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1429 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1430 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1431 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1432 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1433 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1434 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1435 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1436 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1437 ; GFX10-NEXT: v_mov_b32_e32 v0, 16
1438 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x102
1439 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1440 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
1441 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1442 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1443 ; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1444 ; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1445 ; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1
1446 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1447 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1448 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005
1449 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1450 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1451 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1453 ; GFX9-LABEL: or_store_div:
1455 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1456 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1457 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1458 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1459 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1460 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
1461 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1462 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1463 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1464 ; GFX9-NEXT: s_mov_b32 s4, 0x2010005
1465 ; GFX9-NEXT: s_movk_i32 s5, 0x102
1466 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1467 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1468 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2
1469 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1470 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1471 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1473 ; GFX9-NEXT: v_perm_b32 v4, v0, v2, s4
1474 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1475 ; GFX9-NEXT: v_or_b32_e32 v0, 0x201, v0
1476 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1477 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1478 ; GFX9-NEXT: global_store_dword v[7:8], v4, off
1479 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1480 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec0[1], vec1[0], vec1[1], vec1[2]> (mask indices >= 4 select from %vec1).
; The shuffle is OR-ed with the constant vector <1, 2, 2, 1> and that result is
; stored to %out0; the unmodified shuffle is stored to %out1.
; The assertions above expect the stored shuffle to be produced by a single
; v_perm_b32 with selector 0x2010005 on both targets.
; %elt is not referenced by the body shown here.
1481 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1482 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1483 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1484 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1485 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1486 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
1487 %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1488 store <4 x i8> %vecins, ptr addrspace(1) %out0
1489 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1493 define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1494 ; GFX10-LABEL: sdiv_store_div:
1496 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1497 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1498 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1499 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1500 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1501 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1502 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1503 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1504 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1505 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1506 ; GFX10-NEXT: v_bfe_i32 v0, v4, 0, 8
1507 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1508 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 24, v9
1509 ; GFX10-NEXT: v_bfe_i32 v3, v4, 8, 8
1510 ; GFX10-NEXT: v_bfe_i32 v1, v9, 16, 8
1511 ; GFX10-NEXT: v_bfe_i32 v10, v4, 16, 8
1512 ; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v0
1513 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 24, v4
1514 ; GFX10-NEXT: v_xor_b32_e32 v15, v2, v3
1515 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
1516 ; GFX10-NEXT: v_xor_b32_e32 v12, v1, v0
1517 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v13
1518 ; GFX10-NEXT: v_cvt_f32_i32_e32 v14, v1
1519 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v10
1520 ; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10
1521 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v3
1522 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v11
1523 ; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11
1524 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
1525 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v10
1526 ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12
1527 ; GFX10-NEXT: v_mul_f32_e32 v16, v14, v16
1528 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11
1529 ; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15
1530 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1531 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
1532 ; GFX10-NEXT: v_trunc_f32_e32 v16, v16
1533 ; GFX10-NEXT: v_or_b32_e32 v12, 1, v12
1534 ; GFX10-NEXT: v_or_b32_e32 v15, 1, v15
1535 ; GFX10-NEXT: v_mul_f32_e32 v18, v14, v18
1536 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17
1537 ; GFX10-NEXT: v_mad_f32 v20, -v16, v13, v14
1538 ; GFX10-NEXT: v_mul_f32_e32 v19, v13, v19
1539 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1540 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18
1541 ; GFX10-NEXT: v_mad_f32 v2, -v17, v3, v2
1542 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13|
1543 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19
1544 ; GFX10-NEXT: v_or_b32_e32 v1, 1, v1
1545 ; GFX10-NEXT: v_mad_f32 v14, -v18, v10, v14
1546 ; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
1547 ; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
1548 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3|
1549 ; GFX10-NEXT: v_mad_f32 v21, -v19, v11, v13
1550 ; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16
1551 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
1552 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
1553 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v15, vcc_lo
1554 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10|
1555 ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
1556 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v16, v12
1557 ; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1558 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
1559 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11|
1560 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1561 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1
1562 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
1563 ; GFX10-NEXT: v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1564 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1565 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
1566 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1567 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1568 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1569 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1571 ; GFX9-LABEL: sdiv_store_div:
1573 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1575 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1576 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1577 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1578 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1579 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1580 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
1581 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1582 ; GFX9-NEXT: s_mov_b32 s4, 0x60706
1583 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1584 ; GFX9-NEXT: v_bfe_i32 v1, v4, 0, 8
1585 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1586 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1587 ; GFX9-NEXT: v_bfe_i32 v2, v9, 16, 8
1588 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 24, v9
1589 ; GFX9-NEXT: v_bfe_i32 v9, v4, 8, 8
1590 ; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v1
1591 ; GFX9-NEXT: v_bfe_i32 v10, v4, 16, 8
1592 ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 24, v4
1593 ; GFX9-NEXT: v_xor_b32_e32 v14, v3, v9
1594 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9
1595 ; GFX9-NEXT: v_xor_b32_e32 v11, v2, v1
1596 ; GFX9-NEXT: v_cvt_f32_i32_e32 v13, v2
1597 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v10
1598 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v10
1599 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v4
1600 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
1601 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v12
1602 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
1603 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v9
1604 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v10
1605 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4
1606 ; GFX9-NEXT: v_mul_f32_e32 v15, v13, v15
1607 ; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16
1608 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15
1609 ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v11
1610 ; GFX9-NEXT: v_mul_f32_e32 v17, v13, v17
1611 ; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18
1612 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16
1613 ; GFX9-NEXT: v_mad_f32 v19, -v15, v12, v13
1614 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14
1615 ; GFX9-NEXT: v_or_b32_e32 v11, 1, v11
1616 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17
1617 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18
1618 ; GFX9-NEXT: v_mad_f32 v3, -v16, v9, v3
1619 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v12|
1620 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2
1621 ; GFX9-NEXT: v_or_b32_e32 v14, 1, v14
1622 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15
1623 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16
1624 ; GFX9-NEXT: v_mad_f32 v13, -v17, v10, v13
1625 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17
1626 ; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v12
1627 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
1628 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
1629 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9|
1630 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1631 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
1632 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc
1633 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v10|
1634 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
1635 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
1636 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4|
1637 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1638 ; GFX9-NEXT: v_add_u32_e32 v4, v15, v11
1639 ; GFX9-NEXT: v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1640 ; GFX9-NEXT: v_add_u32_e32 v2, v17, v2
1641 ; GFX9-NEXT: v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1642 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1643 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1644 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1645 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
1646 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1647 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1648 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec0[2], vec0[3], vec0[2], vec1[0]> (mask indices >= 4 select from %vec1).
; The shuffle is divided (signed, element-wise) by %vec1 and the quotient is
; stored to %out0; the unmodified shuffle is stored to %out1.
; The assertions above expect each i8 division to be expanded into a
; float sequence (v_cvt_f32_i32 / v_rcp_iflag_f32 / v_trunc_f32 / v_mad_f32
; plus a compare-and-select correction), while the stored shuffle is still a
; single v_perm_b32 with selector 0x60706 on both targets.
; %elt is not referenced by the body shown here.
1649 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1650 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1651 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1652 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1653 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1654 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4>
1655 %vecins = sdiv <4 x i8> %shuffle0_0, %vec1
1656 store <4 x i8> %vecins, ptr addrspace(1) %out0
1657 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1662 define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1663 ; GFX10-LABEL: sext_store_div:
1665 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1666 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1667 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1668 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1669 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1670 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1671 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1672 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1673 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1674 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1675 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1676 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1677 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1678 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, v4
1679 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
1680 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, v1
1681 ; GFX10-NEXT: v_perm_b32 v1, v0, v2, 0x5040100
1682 ; GFX10-NEXT: v_perm_b32 v0, v3, v3, 0x5040100
1683 ; GFX10-NEXT: v_perm_b32 v2, v9, v4, 0x3010707
1684 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1685 ; GFX10-NEXT: global_store_dword v[5:6], v2, off
1686 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1688 ; GFX9-LABEL: sext_store_div:
1690 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1691 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1692 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1693 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1694 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1695 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1696 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1697 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1698 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1699 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
1700 ; GFX9-NEXT: s_mov_b32 s5, 0x5040100
1701 ; GFX9-NEXT: s_mov_b32 s4, 0x3010707
1702 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1703 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9
1704 ; GFX9-NEXT: v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1705 ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1706 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s5
1707 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s5
1708 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
1709 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1710 ; GFX9-NEXT: global_store_dword v[5:6], v2, off
1711 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1712 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec0[3], vec0[3], vec1[1], vec1[3]> (mask indices >= 4 select from %vec1).
; The shuffle is sign-extended to <4 x i16>. Note the store targets: the
; widened vector goes to %out1 and the unmodified shuffle goes to %out0.
; The assertions above expect the stored shuffle to be a single v_perm_b32
; with selector 0x3010707, and the widened value to be assembled from 16-bit
; arithmetic shifts combined by v_perm_b32 with selector 0x5040100.
; %elt is not referenced by the body shown here.
1713 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1714 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1715 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1716 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1717 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1718 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
1719 %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16>
1720 store <4 x i16> %insvec, ptr addrspace(1) %out1
1721 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1726 define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1727 ; GFX10-LABEL: shl_store_div:
1729 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1730 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1731 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1732 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1733 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1734 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1735 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1736 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1737 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1738 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1739 ; GFX10-NEXT: v_lshlrev_b16 v0, 2, v4
1740 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1741 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v9
1742 ; GFX10-NEXT: v_and_b32_e32 v2, 0xfffffc00, v0
1743 ; GFX10-NEXT: v_and_b32_e32 v3, 0xfe, v1
1744 ; GFX10-NEXT: v_and_b32_e32 v1, 0xfffffe00, v1
1745 ; GFX10-NEXT: v_and_b32_e32 v0, 0xfc, v0
1746 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
1747 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1748 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5000104
1749 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1750 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1751 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1752 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1754 ; GFX9-LABEL: shl_store_div:
1756 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1758 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1759 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1760 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1761 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1762 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1763 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1764 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1765 ; GFX9-NEXT: s_mov_b32 s4, 0x5000104
1766 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1767 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 2, v4
1768 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1769 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 1, v9
1770 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1771 ; GFX9-NEXT: v_and_b32_e32 v3, 0xfffffc00, v1
1772 ; GFX9-NEXT: v_and_b32_e32 v4, 0xfe, v2
1773 ; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffe00, v2
1774 ; GFX9-NEXT: v_and_b32_e32 v1, 0xfc, v1
1775 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v3
1776 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1777 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1778 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
1779 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1780 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1781 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec1[0], vec0[1], vec0[0], vec1[1]> (mask indices >= 4 select from %vec1).
; The shuffle is shifted left element-wise by <1, 2, 2, 1> and that result is
; stored to %out0; the unmodified shuffle is stored to %out1.
; The assertions above expect the stored shuffle to be a single v_perm_b32
; with selector 0x5000104 on both targets, and the shifted value to be built
; from two 16-bit shifts whose bytes are separated with AND masks.
; %elt is not referenced by the body shown here.
1782 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1783 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1784 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1785 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1786 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1787 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5>
1788 %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1789 store <4 x i8> %vecins, ptr addrspace(1) %out0
1790 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1795 define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1796 ; GFX10-LABEL: sitofp_store_div:
1798 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1800 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1801 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1802 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1803 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1804 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1805 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1806 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1807 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1808 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1809 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1810 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1811 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, v9
1812 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, v4
1813 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x6010205
1814 ; GFX10-NEXT: v_bfe_i32 v10, v0, 0, 8
1815 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
1816 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1817 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1818 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1819 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1820 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1821 ; GFX10-NEXT: global_store_dword v[5:6], v4, off
1822 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1824 ; GFX9-LABEL: sitofp_store_div:
1826 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1827 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1828 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1829 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1830 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1831 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1832 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
1833 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
1834 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1835 ; GFX9-NEXT: s_mov_b32 s4, 0x6010205
1836 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1837 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
1838 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9
1839 ; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 8
1840 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1841 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
1842 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 8, v4
1843 ; GFX9-NEXT: v_bfe_i32 v11, v2, 0, 8
1844 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1845 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1846 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1847 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1848 ; GFX9-NEXT: v_perm_b32 v4, v4, v9, s4
1849 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1850 ; GFX9-NEXT: global_store_dword v[5:6], v4, off
1851 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1852 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Test body: each work-item loads one <4 x i8> from %in0 and one from %in1,
; both indexed by workitem.id.x, then shuffles them into
; <vec1[1], vec0[2], vec0[1], vec1[2]> (mask indices >= 4 select from %vec1).
; The shuffle is converted (signed) to <4 x float>. Note the store targets: the
; float vector goes to %out1 and the unmodified shuffle goes to %out0.
; The assertions above expect the stored shuffle to be a single v_perm_b32
; with selector 0x6010205 on both targets, and each float lane to come from
; its own sign-extending v_cvt_f32_i32 conversion.
; %elt is not referenced by the body shown here.
1853 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1854 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1855 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1856 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1857 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1858 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6>
1859 %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float>
1860 store <4 x float> %insvec, ptr addrspace(1) %out1
1861 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1866 define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1867 ; GFX10-LABEL: srem_store_div:
1869 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1871 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1872 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1873 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1874 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1875 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1876 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1877 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1878 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1879 ; GFX10-NEXT: v_bfe_i32 v1, v4, 0, 8
1880 ; GFX10-NEXT: v_bfe_i32 v2, v4, 16, 8
1881 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1882 ; GFX10-NEXT: v_ashrrev_i32_e32 v10, 24, v9
1883 ; GFX10-NEXT: v_bfe_i32 v11, v4, 8, 8
1884 ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 24, v4
1885 ; GFX10-NEXT: v_bfe_i32 v13, v9, 16, 8
1886 ; GFX10-NEXT: v_xor_b32_e32 v14, v2, v1
1887 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
1888 ; GFX10-NEXT: v_xor_b32_e32 v16, v10, v11
1889 ; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11
1890 ; GFX10-NEXT: v_cvt_f32_i32_e32 v15, v2
1891 ; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10
1892 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v1
1893 ; GFX10-NEXT: v_cvt_f32_i32_e32 v17, v12
1894 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11
1895 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15
1896 ; GFX10-NEXT: v_xor_b32_e32 v2, v12, v2
1897 ; GFX10-NEXT: v_xor_b32_e32 v12, v13, v12
1898 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v21, v17
1899 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14
1900 ; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v13
1901 ; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16
1902 ; GFX10-NEXT: v_mul_f32_e32 v18, v15, v18
1903 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2
1904 ; GFX10-NEXT: v_mul_f32_e32 v19, v10, v19
1905 ; GFX10-NEXT: v_mul_f32_e32 v20, v17, v20
1906 ; GFX10-NEXT: v_or_b32_e32 v14, 1, v14
1907 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18
1908 ; GFX10-NEXT: v_mul_f32_e32 v21, v13, v21
1909 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19
1910 ; GFX10-NEXT: v_trunc_f32_e32 v20, v20
1911 ; GFX10-NEXT: v_or_b32_e32 v16, 1, v16
1912 ; GFX10-NEXT: v_mad_f32 v22, -v18, v1, v15
1913 ; GFX10-NEXT: v_trunc_f32_e32 v21, v21
1914 ; GFX10-NEXT: v_mad_f32 v10, -v19, v11, v10
1915 ; GFX10-NEXT: v_mad_f32 v23, -v20, v15, v17
1916 ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12
1917 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1|
1918 ; GFX10-NEXT: v_or_b32_e32 v2, 1, v2
1919 ; GFX10-NEXT: v_mad_f32 v13, -v21, v17, v13
1920 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
1921 ; GFX10-NEXT: v_or_b32_e32 v12, 1, v12
1922 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v14, vcc_lo
1923 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11|
1924 ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
1925 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20
1926 ; GFX10-NEXT: v_cvt_i32_f32_e32 v21, v21
1927 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1928 ; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc_lo
1929 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15|
1930 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4
1931 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1
1932 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v19, v10
1933 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
1934 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17|
1935 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4
1936 ; GFX10-NEXT: v_mul_lo_u32 v3, v10, v3
1937 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v20, v2
1938 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo
1939 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4
1940 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0
1941 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v21, v11
1942 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
1943 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1944 ; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12
1945 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v2
1946 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1947 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1948 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1949 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1950 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306
1951 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1952 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1953 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1955 ; GFX9-LABEL: srem_store_div:
1957 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1959 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1960 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1961 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1962 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1963 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1964 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
1965 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1966 ; GFX9-NEXT: s_mov_b32 s4, 0x2070306
1967 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1968 ; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 8
1969 ; GFX9-NEXT: v_bfe_i32 v3, v4, 16, 8
1970 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1971 ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 24, v9
1972 ; GFX9-NEXT: v_bfe_i32 v12, v4, 8, 8
1973 ; GFX9-NEXT: v_xor_b32_e32 v16, v3, v2
1974 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
1975 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 24, v4
1976 ; GFX9-NEXT: v_xor_b32_e32 v18, v11, v12
1977 ; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v12
1978 ; GFX9-NEXT: v_cvt_f32_i32_e32 v17, v3
1979 ; GFX9-NEXT: v_cvt_f32_i32_e32 v19, v13
1980 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v2
1981 ; GFX9-NEXT: v_bfe_i32 v15, v9, 16, 8
1982 ; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v11
1983 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v12
1984 ; GFX9-NEXT: v_xor_b32_e32 v3, v13, v3
1985 ; GFX9-NEXT: v_xor_b32_e32 v13, v15, v13
1986 ; GFX9-NEXT: v_cvt_f32_i32_e32 v15, v15
1987 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v17
1988 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v19
1989 ; GFX9-NEXT: v_mul_f32_e32 v20, v17, v20
1990 ; GFX9-NEXT: v_mul_f32_e32 v21, v11, v21
1991 ; GFX9-NEXT: v_trunc_f32_e32 v20, v20
1992 ; GFX9-NEXT: v_ashrrev_i32_e32 v16, 30, v16
1993 ; GFX9-NEXT: v_mul_f32_e32 v22, v19, v22
1994 ; GFX9-NEXT: v_mul_f32_e32 v23, v15, v23
1995 ; GFX9-NEXT: v_trunc_f32_e32 v21, v21
1996 ; GFX9-NEXT: v_mad_f32 v24, -v20, v2, v17
1997 ; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18
1998 ; GFX9-NEXT: v_or_b32_e32 v16, 1, v16
1999 ; GFX9-NEXT: v_trunc_f32_e32 v22, v22
2000 ; GFX9-NEXT: v_trunc_f32_e32 v23, v23
2001 ; GFX9-NEXT: v_mad_f32 v11, -v21, v12, v11
2002 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v2|
2003 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v3
2004 ; GFX9-NEXT: v_or_b32_e32 v18, 1, v18
2005 ; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20
2006 ; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21
2007 ; GFX9-NEXT: v_mad_f32 v25, -v22, v17, v19
2008 ; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22
2009 ; GFX9-NEXT: v_mad_f32 v15, -v23, v19, v15
2010 ; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23
2011 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
2012 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12|
2013 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 30, v13
2014 ; GFX9-NEXT: v_or_b32_e32 v3, 1, v3
2015 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v18, vcc
2016 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v17|
2017 ; GFX9-NEXT: v_or_b32_e32 v13, 1, v13
2018 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
2019 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, |v19|
2020 ; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc
2021 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
2022 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4
2023 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v4
2024 ; GFX9-NEXT: v_add_u32_e32 v2, v20, v2
2025 ; GFX9-NEXT: v_add_u32_e32 v11, v21, v11
2026 ; GFX9-NEXT: v_add_u32_e32 v3, v22, v3
2027 ; GFX9-NEXT: v_add_u32_e32 v12, v23, v12
2028 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
2029 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4
2030 ; GFX9-NEXT: v_mul_lo_u32 v4, v11, v10
2031 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0
2032 ; GFX9-NEXT: v_mul_lo_u32 v10, v12, v14
2033 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
2034 ; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2035 ; GFX9-NEXT: v_sub_u32_e32 v3, v14, v3
2036 ; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2037 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2038 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2039 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2040 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2041 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
2042 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2043 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2044 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2045 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2046 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2047 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2048 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2049 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2>
2050 %vecins = srem <4 x i8> %shuffle0_0, %vec1
2051 store <4 x i8> %vecins, ptr addrspace(1) %out0
2052 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sub_store_div: loads one <4 x i8> from %in0 and one from %in1, both indexed
; by the workitem id x, builds a two-source byte shuffle, subtracts %vec1 from
; it, and stores the difference to %out0 and the raw shuffle to %out1.
; %elt is not referenced by the body.
; The assertions for both run lines expect the shuffle itself to be emitted as
; a single v_perm_b32 with selector 0x6070007 (an inline literal on gfx1010,
; materialized in s4 on gfx908).
2057 define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2058 ; GFX10-LABEL: sub_store_div:
2060 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2061 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2062 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2063 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2064 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2065 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2066 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2067 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2068 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2069 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2070 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
2071 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2
2072 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v2
2073 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2074 ; GFX10-NEXT: v_sub_nc_u16 v3, v0, v3
2075 ; GFX10-NEXT: v_sub_nc_u16 v9, v1, v4
2076 ; GFX10-NEXT: v_sub_nc_u16 v10, v4, v2
2077 ; GFX10-NEXT: v_sub_nc_u16 v1, v4, v1
2078 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007
2079 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
2080 ; GFX10-NEXT: v_lshlrev_b16 v4, 8, v9
2081 ; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2082 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2083 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2084 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2085 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2086 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2088 ; GFX9-LABEL: sub_store_div:
2090 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2091 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2092 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2093 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2094 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2095 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2096 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
2097 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2098 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2099 ; GFX9-NEXT: s_mov_b32 s4, 0x6070007
2100 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2101 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
2102 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2
2103 ; GFX9-NEXT: v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2104 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2105 ; GFX9-NEXT: v_perm_b32 v4, v2, v0, s4
2106 ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
2107 ; GFX9-NEXT: v_sub_u16_e32 v2, v3, v2
2108 ; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1
2109 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2110 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2111 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2112 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2113 ; GFX9-NEXT: global_store_dword v[7:8], v4, off
2114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2115 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2116 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2117 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2118 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2119 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2120 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, so the result is
; <vec1[3], vec0[0], vec1[3], vec1[2]>.
2121 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6>
2122 %vecins = sub <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the sub above and the direct store to %out1.
2123 store <4 x i8> %vecins, ptr addrspace(1) %out0
2124 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sv_store_div: loads one <4 x i8> from %in0 and one from %in1, both indexed
; by the workitem id x, and chains two shufflevectors (the second takes the
; first shuffle and %vec1 as its sources).
; NOTE(review): both stores below write to %out1, and %out0, %out2 and %elt are
; never referenced. The second store overwrites the first at the same address,
; which is consistent with the assertions containing only one v_perm_b32
; (selector 0x50705, i.e. %shuffle0_0) and one global_store_dword. The first
; store may have been meant for a different output pointer -- confirm with the
; test author. Changing it requires regenerating the assertions with
; utils/update_llc_test_checks.py.
2129 define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
2130 ; GFX10-LABEL: sv_store_div:
2132 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2134 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2135 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2136 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2137 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2138 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2139 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2140 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
2141 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2142 ; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x50705
2143 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2144 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2146 ; GFX9-LABEL: sv_store_div:
2148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2150 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2151 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2152 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2153 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2154 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2155 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2156 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
2157 ; GFX9-NEXT: s_mov_b32 s4, 0x50705
2158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2159 ; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
2160 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
2161 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2162 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2163 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2164 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2165 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2166 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2167 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from the first operand and 4-7 from the second:
; %shuffle0_0 = <vec0[1], vec0[3], vec0[1], vec1[0]>
; %insvec     = <vec1[2], shuffle0_0[3], vec1[3], shuffle0_0[0]>
2168 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4>
2169 %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0>
; Both stores use %out1; the store of %insvec is overwritten by the next one.
2170 store <4 x i8> %insvec, ptr addrspace(1) %out1
2171 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; trunc_store_div: loads one <4 x i8> from %in0 and one from %in1, both indexed
; by the workitem id x, builds a two-source byte shuffle, truncates it to
; <4 x i1>, and stores the <4 x i1> to %out1 and the raw shuffle to %out0.
; %elt is not referenced by the body.
; The assertions expect the shuffle to be a single v_perm_b32 with selector
; 0x50205, and the <4 x i1> value to be assembled with and/shift/or, masked
; with 15, and written by one global_store_byte.
2176 define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2177 ; GFX10-LABEL: trunc_store_div:
2179 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2180 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2181 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2182 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2183 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2184 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2185 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2186 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2187 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2188 ; GFX10-NEXT: v_mov_b32_e32 v0, 1
2189 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2190 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2191 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2192 ; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2193 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
2194 ; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0
2195 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
2196 ; GFX10-NEXT: v_lshlrev_b16 v1, 3, v4
2197 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
2198 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
2199 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205
2200 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
2201 ; GFX10-NEXT: global_store_byte v[7:8], v0, off
2202 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2203 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2205 ; GFX9-LABEL: trunc_store_div:
2207 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2208 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2209 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2210 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2211 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2212 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2213 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2214 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2215 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2216 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
2217 ; GFX9-NEXT: s_mov_b32 s4, 0x50205
2218 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2219 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 3, v4
2220 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2221 ; GFX9-NEXT: v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2222 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2223 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
2224 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
2225 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 2, v2
2226 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
2227 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
2228 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
2229 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
2230 ; GFX9-NEXT: global_store_byte v[7:8], v0, off
2231 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
2232 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2233 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2234 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2235 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2236 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2237 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2238 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, so the result is
; <vec1[1], vec0[2], vec1[1], vec0[0]>.
2239 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0>
2240 %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1>
; Unlike the arithmetic tests in this file, the derived value goes to %out1
; and the raw shuffle goes to %out0.
2241 store <4 x i1> %insvec, ptr addrspace(1) %out1
2242 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; udiv: loads one <4 x i8> from %in0 and one from %in1, both indexed by the
; workitem id x, builds a two-source byte shuffle, divides it element-wise
; (unsigned) by %vec1, and stores the quotient to %out0 and the raw shuffle to
; %out1. %elt is not referenced by the body.
; The assertions expect the shuffle to be a single v_perm_b32 with selector
; 0x40207, and each i8 division to be expanded through float conversion
; (v_cvt_f32_ubyteN, v_rcp_iflag_f32, v_mul_f32, v_trunc_f32, v_mad_f32 and a
; compare-driven add-with-carry).
2246 define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2247 ; GFX10-LABEL: udiv:
2249 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2250 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2251 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2252 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2253 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2254 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2255 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2256 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2257 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2258 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2259 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
2260 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
2261 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2
2262 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0
2264 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
2265 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1
2266 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3
2267 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9
2268 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0
2269 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
2270 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207
2271 ; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10
2272 ; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11
2273 ; GFX10-NEXT: v_mul_f32_e32 v13, v1, v13
2274 ; GFX10-NEXT: v_mul_f32_e32 v12, v15, v12
2275 ; GFX10-NEXT: v_trunc_f32_e32 v10, v10
2276 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11
2277 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13
2278 ; GFX10-NEXT: v_trunc_f32_e32 v12, v12
2279 ; GFX10-NEXT: v_mad_f32 v14, -v10, v1, v14
2280 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
2281 ; GFX10-NEXT: v_mad_f32 v16, -v11, v3, v4
2282 ; GFX10-NEXT: v_mad_f32 v17, -v13, v9, v1
2283 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
2284 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, v1
2285 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13
2286 ; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15
2287 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
2288 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2289 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, v3
2290 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2291 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, v9
2292 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
2293 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2294 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v4
2295 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2296 ; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9
2297 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2298 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2299 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2300 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2301 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2302 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2306 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2307 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2308 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2309 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2310 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2311 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2312 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2313 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
2314 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2315 ; GFX9-NEXT: s_mov_b32 s4, 0x40207
2316 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2317 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
2318 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2
2319 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2320 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3
2321 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2322 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9
2323 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4
2324 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10
2325 ; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11
2326 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
2327 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4
2328 ; GFX9-NEXT: v_trunc_f32_e32 v11, v11
2329 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4
2330 ; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12
2331 ; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1
2332 ; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11
2333 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9
2334 ; GFX9-NEXT: v_trunc_f32_e32 v12, v12
2335 ; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13
2336 ; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10
2337 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
2338 ; GFX9-NEXT: v_trunc_f32_e32 v13, v13
2339 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
2340 ; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14
2341 ; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9
2342 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13
2343 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
2344 ; GFX9-NEXT: v_trunc_f32_e32 v14, v14
2345 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3
2346 ; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2
2347 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
2348 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
2349 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10
2350 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
2351 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4
2352 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
2353 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
2354 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
2355 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2356 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2357 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2358 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
2359 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
2360 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2361 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2362 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2363 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2364 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2365 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2366 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, so the result is
; <vec0[3], vec1[2], vec0[0], vec1[0]>.
2367 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4>
2368 %vecins = udiv <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the udiv above and the direct store to %out1.
2369 store <4 x i8> %vecins, ptr addrspace(1) %out0
2370 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; uitofp_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id x, builds a two-source byte shuffle, converts it
; to <4 x float> (unsigned), and stores the float vector to %out1 and the raw
; shuffle to %out0. %elt is not referenced by the body.
; The assertions expect each lane's conversion to read its byte directly from
; the loaded dwords via v_cvt_f32_ubyteN, with one v_perm_b32 (selector
; 0x5020104) feeding only the <4 x i8> store.
2375 define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2376 ; GFX10-LABEL: uitofp_store_div:
2378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2379 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2380 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2381 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2382 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2383 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2384 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2385 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
2386 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
2387 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2388 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2390 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v9
2391 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v9
2392 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
2393 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5020104
2394 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2395 ; GFX10-NEXT: global_store_dword v[5:6], v4, off
2396 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2398 ; GFX9-LABEL: uitofp_store_div:
2400 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2401 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2402 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2403 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2404 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2405 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2406 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2407 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2408 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2409 ; GFX9-NEXT: s_mov_b32 s4, 0x5020104
2410 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2411 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
2412 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2413 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9
2414 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
2415 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
2416 ; GFX9-NEXT: v_perm_b32 v10, v9, v4, s4
2417 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2418 ; GFX9-NEXT: global_store_dword v[5:6], v10, off
2419 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2420 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2421 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2422 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2423 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2424 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2425 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, so the result is
; <vec1[0], vec0[1], vec0[2], vec1[1]>.
2426 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5>
2427 %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float>
; Unlike the arithmetic tests in this file, the derived value goes to %out1
; and the raw shuffle goes to %out0.
2428 store <4 x float> %insvec, ptr addrspace(1) %out1
2429 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; urem_store_div: loads one <4 x i8> from %in0 and one from %in1, both indexed
; by the workitem id x, builds a two-source byte shuffle, takes the
; element-wise unsigned remainder by %vec1, and stores the remainder to %out0
; and the raw shuffle to %out1. %elt is not referenced by the body.
; The assertions expect the shuffle to be a single v_perm_b32 with selector
; 0x2050505, and each i8 remainder to be expanded as a float-based quotient
; (v_cvt_f32_ubyteN, v_rcp_iflag_f32, v_mul_f32, v_trunc_f32, v_mad_f32)
; followed by v_mul_lo_u32 and a subtract.
2434 define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2435 ; GFX10-LABEL: urem_store_div:
2437 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2438 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2439 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2440 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2441 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2442 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2443 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2444 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2445 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2446 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2447 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
2448 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
2449 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
2450 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2
2451 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2452 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0
2453 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1
2454 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3
2455 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
2456 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9
2457 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
2458 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2
2459 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2
2460 ; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10
2461 ; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11
2462 ; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12
2463 ; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13
2464 ; GFX10-NEXT: v_trunc_f32_e32 v10, v10
2465 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11
2466 ; GFX10-NEXT: v_trunc_f32_e32 v12, v12
2467 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13
2468 ; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3
2469 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
2470 ; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3
2471 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
2472 ; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3
2473 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
2474 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
2475 ; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15
2476 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13
2477 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2478 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
2479 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
2480 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2481 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
2482 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16
2483 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1
2484 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2485 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
2486 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14
2487 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2488 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2489 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2490 ; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17
2491 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4
2492 ; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2493 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505
2494 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2495 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2496 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2497 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2498 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2500 ; GFX9-LABEL: urem_store_div:
2502 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2504 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2505 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2506 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2507 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2508 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2509 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
2510 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2511 ; GFX9-NEXT: s_mov_b32 s4, 0x2050505
2512 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2513 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
2514 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2
2515 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2516 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3
2517 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4
2518 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11
2519 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15
2520 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4
2521 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15
2522 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14
2523 ; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16
2524 ; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3
2525 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
2526 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16
2527 ; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17
2528 ; GFX9-NEXT: v_mad_f32 v20, -v16, v3, v3
2529 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16
2530 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2531 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9
2532 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17
2533 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2
2534 ; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18
2535 ; GFX9-NEXT: v_mad_f32 v21, -v17, v11, v3
2536 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17
2537 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v15, vcc
2538 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18
2539 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, v3
2540 ; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13
2541 ; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18
2542 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v16, vcc
2543 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v21|, v11
2544 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v17, vcc
2545 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14
2546 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
2547 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4
2548 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4
2549 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc
2550 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
2551 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4
2552 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v10
2553 ; GFX9-NEXT: v_mul_lo_u32 v0, v11, v0
2554 ; GFX9-NEXT: v_mul_lo_u32 v4, v13, v12
2555 ; GFX9-NEXT: v_sub_u32_e32 v2, v10, v2
2556 ; GFX9-NEXT: v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2557 ; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0
2558 ; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2559 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2560 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2561 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2562 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2563 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
2564 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2565 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2566 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2567 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2568 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2569 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2570 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, so the result is
; <vec1[1], vec1[1], vec1[1], vec0[2]>.
2571 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
2572 %vecins = urem <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the urem above and the direct store to %out1.
2573 store <4 x i8> %vecins, ptr addrspace(1) %out0
2574 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test xor_store_div: each workitem loads one <4 x i8> from %in0 and one from
; %in1 (indexed by workitem id x), shuffles them with mask <7, 3, 6, 5>, and
; xors the shuffle with the constant vector <1, 2, 2, 1>.
; The xor result is stored to %out0 and the unmodified shuffle to %out1.
; The checks expect the stored shuffle to be produced by a single v_perm_b32
; (selector 0x5060307), while the xor is computed with and/xor/or instructions.
; The %elt argument is not used by the body.
2579 define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2580 ; GFX10-LABEL: xor_store_div:
2582 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2583 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2584 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2585 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2586 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2587 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2588 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2589 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2590 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2591 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00
2592 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
2593 ; GFX10-NEXT: v_mov_b32_e32 v2, 2
2594 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2595 ; GFX10-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2596 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2597 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2598 ; GFX10-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2599 ; GFX10-NEXT: v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2600 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0
2601 ; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3
2602 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
2603 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2604 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2605 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307
2606 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
2607 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
2608 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2610 ; GFX9-LABEL: xor_store_div:
2612 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2613 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2614 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2615 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2616 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2617 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2618 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2619 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2620 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2621 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
2622 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
2623 ; GFX9-NEXT: v_mov_b32_e32 v1, 2
2624 ; GFX9-NEXT: s_mov_b32 s5, 0x5060307
2625 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2626 ; GFX9-NEXT: v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2628 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2629 ; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2630 ; GFX9-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2631 ; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2
2632 ; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3
2633 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
2634 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2635 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2636 ; GFX9-NEXT: v_perm_b32 v4, v9, v4, s5
2637 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2638 ; GFX9-NEXT: global_store_dword v[7:8], v4, off
2639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2640 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id, in units of <4 x i8>.
2641 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2642 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2643 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2644 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2645 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1.
2646 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5>
2647 %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
; The shuffle has two users: the xor above and the direct store below.
2648 store <4 x i8> %vecins, ptr addrspace(1) %out0
2649 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test zext_store_div: each workitem loads one <4 x i8> from %in0 and one from
; %in1 (indexed by workitem id x) and shuffles them with mask <0, 1, 2, 4>.
; The shuffle is zero-extended to <4 x i16> and stored to %out1; the
; unextended shuffle is stored to %out0.
; The checks expect the <4 x i8> store to come from one v_perm_b32 (selector
; 0x60504) and the widened value to be built from and/shift results combined
; with v_perm_b32 selector 0x5040100.
; The %elt argument is not used by the body.
2654 define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2655 ; GFX10-LABEL: zext_store_div:
2657 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2658 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2659 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2660 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2661 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2662 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2663 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2664 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2665 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2666 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
2667 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2668 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4
2669 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v4
2670 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2671 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v9
2672 ; GFX10-NEXT: v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2673 ; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x5040100
2674 ; GFX10-NEXT: v_perm_b32 v2, v4, v9, 0x60504
2675 ; GFX10-NEXT: v_perm_b32 v1, v3, v10, 0x5040100
2676 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2677 ; GFX10-NEXT: global_store_dword v[5:6], v2, off
2678 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2680 ; GFX9-LABEL: zext_store_div:
2682 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2683 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2684 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2685 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2686 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2687 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2688 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2689 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2690 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2691 ; GFX9-NEXT: s_mov_b32 s4, 0x60504
2692 ; GFX9-NEXT: s_movk_i32 s5, 0xff
2693 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100
2694 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2695 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
2696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2697 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
2698 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v4
2699 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v9
2700 ; GFX9-NEXT: v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2701 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6
2702 ; GFX9-NEXT: v_perm_b32 v1, v3, v4, s6
2703 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2704 ; GFX9-NEXT: global_store_dword v[5:6], v2, off
2705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2706 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id, in units of <4 x i8>.
2707 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2708 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2709 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2710 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2711 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Elements 0-2 of %vec0 followed by element 0 of %vec1.
2712 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
2713 %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
; Note the destinations: the widened vector goes to %out1, the shuffle to %out0.
2714 store <4 x i16> %insvec, ptr addrspace(1) %out1
2715 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; Test Source16Bit: builds an i32 out of bytes taken from 16-bit sources, the
; i16 argument %in and element 1 of the <2 x i16> argument %reg, using only
; and/shl/lshr/or/zext, then stores it through an undef pointer.
; Resulting byte layout, least significant first:
;   %in byte 0, %elt0 byte 0, %in byte 1, %elt0 byte 1.
; The checks expect the whole computation to become one v_perm_b32 with
; selector 0x3050204.
2719 define void @Source16Bit(i16 %in, <2 x i16> %reg) {
2720 ; GFX10-LABEL: Source16Bit:
2721 ; GFX10: ; %bb.0: ; %entry
2722 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2723 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
2724 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
2725 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2727 ; GFX9-LABEL: Source16Bit:
2728 ; GFX9: ; %bb.0: ; %entry
2729 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2730 ; GFX9-NEXT: s_mov_b32 s4, 0x3050204
2731 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
2732 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
2733 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2734 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2736 %elt0 = extractelement <2 x i16> %reg, i32 1
; Split both 16-bit sources into low-byte (& 255) and high-byte (& -256) parts.
2737 %e0b0 = and i16 %elt0, 255
2738 %e0b1 = and i16 %elt0, -256
2739 %e1b0 = and i16 %in, 255
2740 %e1b1 = and i16 %in, -256
; Low half of the result: %elt0 low byte above %in low byte.
2741 %tmp0 = shl i16 %e0b0, 8
2742 %byte0 = or i16 %tmp0, %e1b0
; High half of the result: %elt0 high byte above %in high byte.
2743 %tmp2 = lshr i16 %e1b1, 8
2744 %byte1 = or i16 %e0b1, %tmp2
2745 %ext0 = zext i16 %byte0 to i32
2746 %ext1 = zext i16 %byte1 to i32
2747 %shifted = shl i32 %ext1, 16
2748 %result = or i32 %shifted, %ext0
2749 store i32 %result, ptr addrspace(1) undef
; Test extract3744: assembles an i32 from bytes obtained with
; extractelement + zext + shl + or instead of a shufflevector.
; Resulting byte layout, least significant first:
;   %vec1[0], %vec1[0], %vec1[3], %vec2[3].
; The checks expect the two loads to feed one v_perm_b32 with selector
; 0x3070404.
2753 define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2754 ; GFX10-LABEL: extract3744:
2756 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2758 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2759 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2760 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2761 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2762 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2764 ; GFX9-LABEL: extract3744:
2766 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2767 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2768 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2769 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2770 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2771 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
2772 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2773 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2774 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2775 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2776 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; %vec1[0] is used twice: unshifted as byte 0 and shifted by 8 as byte 1.
2777 %v1e0 = extractelement <4 x i8> %vec1, i64 0
2778 %zv1e0 = zext i8 %v1e0 to i32
2779 %byte1 = shl i32 %zv1e0, 8
2781 %v1e3 = extractelement <4 x i8> %vec1, i64 3
2782 %zv1e3 = zext i8 %v1e3 to i32
2783 %byte2 = shl i32 %zv1e3, 16
2784 %v2e3 = extractelement <4 x i8> %vec2, i64 3
2785 %zv2e3 = zext i8 %v2e3 to i32
2786 %byte3 = shl i32 %zv2e3, 24
; Combine the four positioned bytes into the final dword.
2788 %tmp0 = or i32 %zv1e0, %byte1
2789 %tmp1 = or i32 %tmp0, %byte2
2790 %res = or i32 %tmp1, %byte3
2791 store i32 %res, ptr addrspace(1) %out0, align 4
; Declaration of the AMDGPU perm intrinsic (three i32 operands, i32 result)
; called by the test that follows.
2795 declare i32 @llvm.amdgcn.perm(i32, i32, i32)
; Test extract_perm_3744: the input is already expressed as two
; llvm.amdgcn.perm calls, each permuting a single loaded dword with itself,
; whose results are or'ed together.
; Selector constants in hex: 201523200 = 0x0c030000, 51121164 = 0x030c0c0c.
; NOTE(review): selector byte 0x0c presumably produces a zero byte, which is
; what makes the or of the two results a byte merge; confirm against the ISA
; documentation for v_perm_b32.
; The checks expect the two perms and the or to be combined into a single
; v_perm_b32 with selector 0x3070404.
2797 define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2798 ; GFX10-LABEL: extract_perm_3744:
2800 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2801 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2802 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2803 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2804 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2805 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2806 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2808 ; GFX9-LABEL: extract_perm_3744:
2810 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2811 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2812 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2813 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2814 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2815 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
2816 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2817 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2818 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2819 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2820 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; Reinterpret each byte vector as a dword so it can be passed to the intrinsic.
2821 %cast1 = bitcast <4 x i8> %vec1 to i32
2822 %cast2 = bitcast <4 x i8> %vec2 to i32
; Each perm reads only one source dword (passed as both operands).
2823 %lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200)
2824 %hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164)
2825 %res = or i32 %hi8, %lo24
2826 store i32 %res, ptr addrspace(1) %out0, align 4
; Test extract1347_v2i16: byte assembly where the sources are <2 x i16>
; vectors, so each byte is isolated from an i16 element with and/lshr before
; being zero-extended and shifted into place.
; Resulting byte layout, least significant first:
;   high byte of %vec2[1], low byte of %vec2[0],
;   high byte of %vec1[1], high byte of %vec1[0].
; The checks expect one v_perm_b32 with selector 0x1030407.
2830 define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2831 ; GFX10-LABEL: extract1347_v2i16:
2833 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2835 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2836 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2837 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407
2838 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2839 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2841 ; GFX9-LABEL: extract1347_v2i16:
2843 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2844 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2845 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2846 ; GFX9-NEXT: s_mov_b32 s4, 0x1030407
2847 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2848 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2849 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2850 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2851 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2852 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2853 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2854 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2855 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2856 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2857 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Result byte 0: high byte of %vec2[1].
2859 %b0t0 = and i16 -256, %v2e1
2860 %b0t1 = lshr i16 %b0t0, 8
2861 %byte0 = zext i16 %b0t1 to i32
; Result byte 1: low byte of %vec2[0].
2863 %b1t0 = and i16 255, %v2e0
2864 %b1t1 = zext i16 %b1t0 to i32
2865 %byte1 = shl i32 %b1t1, 8
; Result byte 2: high byte of %vec1[1].
2867 %b2t0 = and i16 -256, %v1e1
2868 %b2t1 = lshr i16 %b2t0, 8
2869 %b2t2 = zext i16 %b2t1 to i32
2870 %byte2 = shl i32 %b2t2, 16
; Result byte 3: high byte of %vec1[0].
2872 %b3t0 = and i16 -256, %v1e0
2873 %b3t1 = lshr i16 %b3t0, 8
2874 %b3t2 = zext i16 %b3t1 to i32
2875 %byte3 = shl i32 %b3t2, 24
2877 %tmp0 = or i32 %byte0, %byte1
2878 %tmp1 = or i32 %tmp0, %byte2
2879 %res = or i32 %tmp1, %byte3
2880 store i32 %res, ptr addrspace(1) %out0, align 4
; Declaration of the 16-bit funnel-shift-right intrinsic used by the
; fshri16_* tests.
2885 declare i16 @llvm.fshr.i16(i16, i16, i16)
; Test fshri16_8: each 16-bit half of the stored i32 is an i16 funnel shift
; right by 8 of the two elements of one input vector: the low half from
; %vec2, the high half from %vec1.
; The checks expect both funnel shifts and the merge to become one v_perm_b32
; with selector 0x30407.
2887 define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2888 ; GFX10-LABEL: fshri16_8:
2890 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2892 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2893 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2894 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2895 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2896 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2898 ; GFX9-LABEL: fshri16_8:
2900 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2901 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2902 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2903 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2904 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2905 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2906 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2907 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2908 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2909 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2910 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2911 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2912 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2913 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2914 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
2916 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
2917 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
2919 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
2920 %tmp23.1 = zext i16 %tmp23.0 to i32
2921 %byte23 = shl i32 %tmp23.1, 16
2922 %res = or i32 %byte01, %byte23
2923 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshri16_16: same shape as the shift-by-8 fshr test, but the shift
; amount equals the i16 bit width. Per the LangRef the funnel-shift amount is
; taken modulo the bit width, so this is a shift by 0.
; The checks expect one v_perm_b32 with selector 0x3020706.
2927 define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2928 ; GFX10-LABEL: fshri16_16:
2930 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2931 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2932 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2933 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2934 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
2935 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2936 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2938 ; GFX9-LABEL: fshri16_16:
2940 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2941 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2942 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2943 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
2944 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2945 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2946 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2947 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2948 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2949 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2950 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2951 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2952 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2953 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2954 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
2956 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
2957 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
2959 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
2960 %tmp23.1 = zext i16 %tmp23.0 to i32
2961 %byte23 = shl i32 %tmp23.1, 16
2962 %res = or i32 %byte01, %byte23
2963 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshri16_24: i16 funnel shift right with an amount (24) larger than the
; bit width. Per the LangRef the amount is taken modulo 16, i.e. 8.
; The checks expect one v_perm_b32 with selector 0x30407.
2967 define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2968 ; GFX10-LABEL: fshri16_24:
2970 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2971 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2972 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2973 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2974 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2975 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2976 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2978 ; GFX9-LABEL: fshri16_24:
2980 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2981 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2982 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2983 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2984 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2985 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2986 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2987 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2988 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2989 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2990 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2991 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2992 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2993 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2994 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
2996 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
2997 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
2999 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
3000 %tmp23.1 = zext i16 %tmp23.0 to i32
3001 %byte23 = shl i32 %tmp23.1, 16
3002 %res = or i32 %byte01, %byte23
3003 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshri16_32: i16 funnel shift right by 32, a multiple of the bit width.
; Per the LangRef the amount is taken modulo 16, i.e. 0.
; The checks expect one v_perm_b32 with selector 0x3020706.
3007 define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3008 ; GFX10-LABEL: fshri16_32:
3010 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3011 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3012 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3013 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3014 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
3015 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3016 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3018 ; GFX9-LABEL: fshri16_32:
3020 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3021 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3022 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3023 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
3024 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3025 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3026 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3027 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3028 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3029 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3030 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3031 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3032 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3033 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3034 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3036 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
3037 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3039 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
3040 %tmp23.1 = zext i16 %tmp23.0 to i32
3041 %byte23 = shl i32 %tmp23.1, 16
3042 %res = or i32 %byte01, %byte23
3043 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshri16_88: i16 funnel shift right by 88. Per the LangRef the amount is
; taken modulo 16, i.e. 8.
; The checks expect one v_perm_b32 with selector 0x30407.
3047 define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3048 ; GFX10-LABEL: fshri16_88:
3050 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3051 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3052 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3053 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3054 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3055 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3056 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3058 ; GFX9-LABEL: fshri16_88:
3060 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3061 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3062 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3063 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3064 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3065 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3066 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3067 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3068 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3069 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3070 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3071 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3072 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3073 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3074 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3076 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
3077 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3079 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
3080 %tmp23.1 = zext i16 %tmp23.0 to i32
3081 %byte23 = shl i32 %tmp23.1, 16
3082 %res = or i32 %byte01, %byte23
3083 store i32 %res, ptr addrspace(1) %out0, align 4
; Declaration of the 16-bit funnel-shift-left intrinsic used by the
; fshli16_* tests.
3087 declare i16 @llvm.fshl.i16(i16, i16, i16)
; Test fshli16_1347: each 16-bit half of the stored i32 is an i16 funnel
; shift left by 8 of the two elements of one input vector: the low half from
; %vec2, the high half from %vec1.
; The checks expect one v_perm_b32 with selector 0x30407.
; NOTE(review): the other fshli16_* tests are named after their shift amount
; (16, 24, 32, 88); this one shifts by 8 but carries the suffix 1347. Confirm
; whether the name is intentional.
3089 define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3090 ; GFX10-LABEL: fshli16_1347:
3092 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3093 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3094 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3095 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3096 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3097 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3098 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3100 ; GFX9-LABEL: fshli16_1347:
3102 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3103 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3104 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3105 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3107 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3108 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3109 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3110 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3111 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3112 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3113 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3114 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3115 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3116 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3118 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
3119 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3121 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
3122 %tmp23.1 = zext i16 %tmp23.0 to i32
3123 %byte23 = shl i32 %tmp23.1, 16
3124 %res = or i32 %byte01, %byte23
3125 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshli16_16: i16 funnel shift left by 16, which equals the bit width.
; Per the LangRef the amount is taken modulo 16, i.e. 0.
; The checks expect one v_perm_b32 with selector 0x1000504.
3129 define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3130 ; GFX10-LABEL: fshli16_16:
3132 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3133 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3134 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3135 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3136 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3137 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3138 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3140 ; GFX9-LABEL: fshli16_16:
3142 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3143 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3144 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3145 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3146 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3147 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3148 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3149 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3150 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3151 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3152 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3153 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3154 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3155 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3156 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3158 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
3159 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3161 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
3162 %tmp23.1 = zext i16 %tmp23.0 to i32
3163 %byte23 = shl i32 %tmp23.1, 16
3164 %res = or i32 %byte01, %byte23
3165 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshli16_24: i16 funnel shift left by 24. Per the LangRef the amount is
; taken modulo 16, i.e. 8.
; The checks expect one v_perm_b32 with selector 0x30407.
3169 define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3170 ; GFX10-LABEL: fshli16_24:
3172 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3173 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3174 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3175 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3176 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3177 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3178 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3180 ; GFX9-LABEL: fshli16_24:
3182 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3183 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3184 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3185 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3186 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3187 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3188 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3190 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3191 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3192 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3193 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3194 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3195 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3196 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3198 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
3199 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3201 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
3202 %tmp23.1 = zext i16 %tmp23.0 to i32
3203 %byte23 = shl i32 %tmp23.1, 16
3204 %res = or i32 %byte01, %byte23
3205 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshli16_32: i16 funnel shift left by 32, a multiple of the bit width.
; Per the LangRef the amount is taken modulo 16, i.e. 0.
; The checks expect one v_perm_b32 with selector 0x1000504.
3209 define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3210 ; GFX10-LABEL: fshli16_32:
3212 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3213 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3214 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3215 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3216 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3217 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3218 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3220 ; GFX9-LABEL: fshli16_32:
3222 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3223 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3224 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3225 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3226 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3227 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3228 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3229 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3230 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3231 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3232 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3233 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3234 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3235 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3236 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3238 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
3239 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3241 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
3242 %tmp23.1 = zext i16 %tmp23.0 to i32
3243 %byte23 = shl i32 %tmp23.1, 16
3244 %res = or i32 %byte01, %byte23
3245 store i32 %res, ptr addrspace(1) %out0, align 4
; Test fshli16_88: i16 funnel shift left by 88. Per the LangRef the amount is
; taken modulo 16, i.e. 8.
; The checks expect one v_perm_b32 with selector 0x30407.
3249 define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3250 ; GFX10-LABEL: fshli16_88:
3252 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3253 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3254 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3255 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3256 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3257 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3258 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3260 ; GFX9-LABEL: fshli16_88:
3262 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3263 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3264 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3265 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3266 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3267 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3268 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3269 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3270 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3271 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3272 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3273 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3274 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3275 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3276 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low 16 bits of the result come from %vec2.
3278 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
3279 %byte01 = zext i16 %tmp01.0 to i32
; High 16 bits of the result come from %vec1.
3281 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
3282 %tmp23.1 = zext i16 %tmp23.0 to i32
3283 %byte23 = shl i32 %tmp23.1, 16
3284 %res = or i32 %byte01, %byte23
3285 store i32 %res, ptr addrspace(1) %out0, align 4
; Test shlbase: byte assembly via zext + shl + or, but every shift amount is
; the runtime argument %base plus a constant (8, 16, 24), so the destination
; bit positions are not known at compile time.
; The checks contain no v_perm_b32: the expected code keeps the variable
; shifts (v_lshlrev_b32_sdwa, v_lshl_or_b32) and merges them with v_or3_b32.
3289 define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
3290 ; GFX10-LABEL: shlbase:
3292 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3293 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3294 ; GFX10-NEXT: global_load_dword v8, v[2:3], off
3295 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6
3296 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v6
3297 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v6
3298 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3299 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7
3300 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3301 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3302 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3303 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, v3, v2
3304 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
3305 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3306 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3308 ; GFX9-LABEL: shlbase:
3310 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3311 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3312 ; GFX9-NEXT: global_load_dword v8, v[2:3], off
3313 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v6
3314 ; GFX9-NEXT: v_add_u32_e32 v1, 16, v6
3315 ; GFX9-NEXT: v_add_u32_e32 v2, 24, v6
3316 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3317 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7
3318 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3319 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3320 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3321 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, v0, v3
3322 ; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
3323 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3325 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3326 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
3327 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; %vec1[0] is used unshifted and shifted by %base + 8.
3328 %v1e0 = extractelement <4 x i8> %vec1, i64 0
3329 %zv1e0 = zext i8 %v1e0 to i32
3330 %b8 = add i32 %base, 8
3331 %byte1 = shl i32 %zv1e0, %b8
; %vec1[3] shifted by %base + 16.
3333 %v1e3 = extractelement <4 x i8> %vec1, i64 3
3334 %zv1e3 = zext i8 %v1e3 to i32
3335 %b16 = add i32 %base, 16
3336 %byte2 = shl i32 %zv1e3, %b16
; %vec2[3] shifted by %base + 24.
3337 %v2e3 = extractelement <4 x i8> %vec2, i64 3
3338 %zv2e3 = zext i8 %v2e3 to i32
3339 %b24 = add i32 %base, 24
3340 %byte3 = shl i32 %zv2e3, %b24
3342 %tmp0 = or i32 %zv1e0, %byte1
3343 %tmp1 = or i32 %tmp0, %byte2
3344 %res = or i32 %tmp1, %byte3
3345 store i32 %res, ptr addrspace(1) %out0, align 4
3349 ; TODO -- lower into v_perm
; Test extractbase: byte assembly via extractelement + zext + shl + or where
; the element indices are runtime values (%base and %base + 3) and the shift
; amounts are the constants 8, 16 and 24.
; The checks contain no v_perm_b32: the expected code extracts the bytes with
; v_bfe_u32 / v_lshrrev_b32_sdwa and merges them with v_lshl_or_b32 and
; v_or3_b32.
3350 define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) {
3351 ; GFX10-LABEL: extractbase:
3353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3354 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3355 ; GFX10-NEXT: global_load_dword v8, v[2:3], off
3356 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6
3357 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v0
3358 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3359 ; GFX10-NEXT: v_bfe_u32 v2, v7, v1, 8
3360 ; GFX10-NEXT: v_bfe_u32 v0, v7, v0, 8
3361 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3362 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3363 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3364 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v0
3365 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
3366 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3367 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3369 ; GFX9-LABEL: extractbase:
3371 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3372 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3373 ; GFX9-NEXT: global_load_dword v8, v[2:3], off
3374 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v6
3375 ; GFX9-NEXT: v_add_u32_e32 v1, 24, v0
3376 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3377 ; GFX9-NEXT: v_bfe_u32 v0, v7, v0, 8
3378 ; GFX9-NEXT: v_bfe_u32 v2, v7, v1, 8
3379 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3380 ; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3381 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3382 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
3383 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
3384 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3385 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3386 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3387 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
3388 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; Element %base of %vec1 is used unshifted and shifted by 8.
3389 %v1b = extractelement <4 x i8> %vec1, i64 %base
3390 %zv1b = zext i8 %v1b to i32
3391 %byte1 = shl i32 %zv1b, 8
; Element %base + 3 is read from both %vec1 (byte 2) and %vec2 (byte 3).
3393 %b3 = add i64 %base, 3
3394 %v1b3 = extractelement <4 x i8> %vec1, i64 %b3
3395 %zv1b3 = zext i8 %v1b3 to i32
3396 %byte2 = shl i32 %zv1b3, 16
3397 %v2b3 = extractelement <4 x i8> %vec2, i64 %b3
3398 %zv2b3 = zext i8 %v2b3 to i32
3399 %byte3 = shl i32 %zv2b3, 24
3401 %tmp0 = or i32 %zv1b, %byte1
3402 %tmp1 = or i32 %tmp0, %byte2
3403 %res = or i32 %tmp1, %byte3
3404 store i32 %res, ptr addrspace(1) %out0, align 4