1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
; Two-source <4 x i8> shuffle with mask <6,7,6,6>. Every index is >= 4, so all
; result bytes come from %vec1. The checks expect a single global load and one
; v_perm_b32 with selector 0x6060706 (materialized in s4 on gfx908).
5 define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
6 ; GFX10-LABEL: shuffle6766:
8 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
10 ; GFX10-NEXT: s_waitcnt vmcnt(0)
11 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706
12 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
13 ; GFX10-NEXT: s_setpc_b64 s[30:31]
15 ; GFX9-LABEL: shuffle6766:
17 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
19 ; GFX9-NEXT: s_mov_b32 s4, 0x6060706
20 ; GFX9-NEXT: s_waitcnt vmcnt(0)
21 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
22 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
23 ; GFX9-NEXT: s_waitcnt vmcnt(0)
24 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
26 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
27 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6>
28 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Two-source <4 x i8> shuffle with mask <3,7,4,4>: index 3 reads %vec0, indices
; 7 and 4 read %vec1, so both loads are kept. The checks expect one v_perm_b32
; with selector 0x307 (loaded with s_movk_i32 on gfx908).
32 define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
33 ; GFX10-LABEL: shuffle3744:
35 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
37 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
38 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307
40 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
41 ; GFX10-NEXT: s_setpc_b64 s[30:31]
43 ; GFX9-LABEL: shuffle3744:
45 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
47 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
48 ; GFX9-NEXT: s_movk_i32 s4, 0x307
49 ; GFX9-NEXT: s_waitcnt vmcnt(0)
50 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
51 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
53 ; GFX9-NEXT: s_setpc_b64 s[30:31]
54 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
55 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
56 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4>
57 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Two-source <4 x i8> shuffle with mask <4,4,4,5>. Only %vec1 elements are
; selected, so the checks expect a single load and one v_perm_b32 with
; selector 0x5040404 (materialized in s4 on gfx908).
61 define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
62 ; GFX10-LABEL: shuffle4445:
64 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
66 ; GFX10-NEXT: s_waitcnt vmcnt(0)
67 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404
68 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
69 ; GFX10-NEXT: s_setpc_b64 s[30:31]
71 ; GFX9-LABEL: shuffle4445:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
75 ; GFX9-NEXT: s_mov_b32 s4, 0x5040404
76 ; GFX9-NEXT: s_waitcnt vmcnt(0)
77 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
78 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
79 ; GFX9-NEXT: s_waitcnt vmcnt(0)
80 ; GFX9-NEXT: s_setpc_b64 s[30:31]
81 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
82 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
83 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5>
84 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Two-source <4 x i8> shuffle with mask <0,1,0,1>. Only %vec0 elements are
; selected, so the checks expect a single load and one v_perm_b32 with
; selector 0x5040504 (materialized in s4 on gfx908).
88 define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
89 ; GFX10-LABEL: shuffle0101:
91 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
93 ; GFX10-NEXT: s_waitcnt vmcnt(0)
94 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
95 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
96 ; GFX10-NEXT: s_setpc_b64 s[30:31]
98 ; GFX9-LABEL: shuffle0101:
100 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
102 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
104 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
105 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
108 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
109 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
110 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
111 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Two-source <4 x i8> shuffle with mask <1,0,0,4>: indices 1 and 0 read %vec0,
; index 4 reads %vec1, so both loads are kept. The checks expect one
; v_perm_b32 with selector 0x40405 (materialized in s4 on gfx908).
115 define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
116 ; GFX10-LABEL: shuffle1004:
118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
120 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
121 ; GFX10-NEXT: s_waitcnt vmcnt(0)
122 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405
123 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
124 ; GFX10-NEXT: s_setpc_b64 s[30:31]
126 ; GFX9-LABEL: shuffle1004:
128 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
130 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
131 ; GFX9-NEXT: s_mov_b32 s4, 0x40405
132 ; GFX9-NEXT: s_waitcnt vmcnt(0)
133 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
134 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: s_setpc_b64 s[30:31]
137 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
138 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
139 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4>
140 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Same shuffle pattern through addrspace(0) pointers: the checks expect
; flat_load_dword / flat_store_dword. Mask <7,5,3,3> reads %vec1 (7, 5) and
; %vec0 (3, 3); one v_perm_b32 with selector 0x3030507 is expected.
146 define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
147 ; GFX10-LABEL: shuffle7533:
149 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150 ; GFX10-NEXT: flat_load_dword v6, v[0:1]
151 ; GFX10-NEXT: flat_load_dword v7, v[2:3]
152 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
153 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507
154 ; GFX10-NEXT: flat_store_dword v[4:5], v0
155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX10-NEXT: s_setpc_b64 s[30:31]
158 ; GFX9-LABEL: shuffle7533:
160 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX9-NEXT: flat_load_dword v6, v[0:1]
162 ; GFX9-NEXT: flat_load_dword v7, v[2:3]
163 ; GFX9-NEXT: s_mov_b32 s4, 0x3030507
164 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
165 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
166 ; GFX9-NEXT: flat_store_dword v[4:5], v0
167 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
168 ; GFX9-NEXT: s_setpc_b64 s[30:31]
169 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
170 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
171 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
172 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
; addrspace(0) variant where mask <7,7,6,7> selects only %vec1 elements. The
; checks expect a single flat_load_dword and one v_perm_b32 with selector
; 0x7060707 (materialized in s4 on gfx908).
176 define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
177 ; GFX10-LABEL: shuffle7767:
179 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; GFX10-NEXT: flat_load_dword v0, v[2:3]
181 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
182 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707
183 ; GFX10-NEXT: flat_store_dword v[4:5], v0
184 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX10-NEXT: s_setpc_b64 s[30:31]
187 ; GFX9-LABEL: shuffle7767:
189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; GFX9-NEXT: flat_load_dword v0, v[2:3]
191 ; GFX9-NEXT: s_mov_b32 s4, 0x7060707
192 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
193 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
194 ; GFX9-NEXT: flat_store_dword v[4:5], v0
195 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
196 ; GFX9-NEXT: s_setpc_b64 s[30:31]
197 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
198 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
199 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
200 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
; addrspace(3) variant: the checks expect ds_read_b32 / ds_write_b32. Mask
; <0,5,5,4> reads %vec0 (0) and %vec1 (5, 5, 4); one v_perm_b32 with selector
; 0x10104 is expected (materialized in s4 on gfx908).
204 define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
205 ; GFX10-LABEL: shuffle0554:
207 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208 ; GFX10-NEXT: ds_read_b32 v0, v0
209 ; GFX10-NEXT: ds_read_b32 v1, v1
210 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104
212 ; GFX10-NEXT: ds_write_b32 v2, v0
213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX10-NEXT: s_setpc_b64 s[30:31]
216 ; GFX9-LABEL: shuffle0554:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX9-NEXT: ds_read_b32 v0, v0
220 ; GFX9-NEXT: ds_read_b32 v1, v1
221 ; GFX9-NEXT: s_mov_b32 s4, 0x10104
222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
224 ; GFX9-NEXT: ds_write_b32 v2, v0
225 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
226 ; GFX9-NEXT: s_setpc_b64 s[30:31]
227 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
228 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
229 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
230 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
; addrspace(3) variant with mask <2,1,2,7>: indices 2 and 1 read %vec0, index
; 7 reads %vec1. The checks expect ds_read_b32 loads and one v_perm_b32 with
; selector 0x3060506 (materialized in s4 on gfx908).
234 define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
235 ; GFX10-LABEL: shuffle2127:
237 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX10-NEXT: ds_read_b32 v0, v0
239 ; GFX10-NEXT: ds_read_b32 v1, v1
240 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506
242 ; GFX10-NEXT: ds_write_b32 v2, v0
243 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX10-NEXT: s_setpc_b64 s[30:31]
246 ; GFX9-LABEL: shuffle2127:
248 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249 ; GFX9-NEXT: ds_read_b32 v0, v0
250 ; GFX9-NEXT: ds_read_b32 v1, v1
251 ; GFX9-NEXT: s_mov_b32 s4, 0x3060506
252 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
254 ; GFX9-NEXT: ds_write_b32 v2, v0
255 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
256 ; GFX9-NEXT: s_setpc_b64 s[30:31]
257 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
258 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
259 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7>
260 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
; addrspace(5) variant: the checks expect buffer_load_dword /
; buffer_store_dword with "offen" addressing. Mask <5,0,4,7> reads %vec1
; (5, 4, 7) and %vec0 (0); one v_perm_b32 with selector 0x7040005 is expected.
264 define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) {
265 ; GFX10-LABEL: shuffle5047:
267 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268 ; GFX10-NEXT: s_clause 0x1
269 ; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
270 ; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
271 ; GFX10-NEXT: s_waitcnt vmcnt(0)
272 ; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005
273 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
274 ; GFX10-NEXT: s_setpc_b64 s[30:31]
276 ; GFX9-LABEL: shuffle5047:
278 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279 ; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
280 ; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
281 ; GFX9-NEXT: s_mov_b32 s4, 0x7040005
282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
283 ; GFX9-NEXT: v_perm_b32 v0, v4, v3, s4
284 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_setpc_b64 s[30:31]
287 %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4
288 %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4
289 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
290 store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4
; Two-source <4 x i8> shuffle over addrspace(1) with mask <3,5,4,6>: index 3
; reads %vec0, indices 5, 4 and 6 read %vec1. The checks expect one
; v_perm_b32 with selector 0x2000107 (materialized in s4 on gfx908).
294 define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
295 ; GFX10-LABEL: shuffle3546:
297 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
299 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
300 ; GFX10-NEXT: s_waitcnt vmcnt(0)
301 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107
302 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
303 ; GFX10-NEXT: s_setpc_b64 s[30:31]
305 ; GFX9-LABEL: shuffle3546:
307 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
309 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
310 ; GFX9-NEXT: s_mov_b32 s4, 0x2000107
311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
312 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
313 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
316 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
317 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
318 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6>
319 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Shuffle whose second operand is undef, so mask indices >= 4 (here 7) name
; undefined lanes. Mask <7,3,3,0>; the checks expect one load and one
; v_perm_b32 with selector 0x4070706 (materialized in s4 on gfx908).
324 define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
325 ; GFX10-LABEL: shuffle7330ud2:
327 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
329 ; GFX10-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706
331 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
332 ; GFX10-NEXT: s_setpc_b64 s[30:31]
334 ; GFX9-LABEL: shuffle7330ud2:
336 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
338 ; GFX9-NEXT: s_mov_b32 s4, 0x4070706
339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
340 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
341 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
342 ; GFX9-NEXT: s_waitcnt vmcnt(0)
343 ; GFX9-NEXT: s_setpc_b64 s[30:31]
344 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
345 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
346 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <5,3,4,1>: only lanes 1 and 3 are
; defined (from %vec0 elements 3 and 1). Unlike the neighbouring tests, the
; checks expect v_alignbit_b32 with a shift of 16 rather than v_perm_b32.
350 define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
351 ; GFX10-LABEL: shuffle5341ud2:
353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
355 ; GFX10-NEXT: s_waitcnt vmcnt(0)
356 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
357 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
358 ; GFX10-NEXT: s_setpc_b64 s[30:31]
360 ; GFX9-LABEL: shuffle5341ud2:
362 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
365 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
366 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
367 ; GFX9-NEXT: s_waitcnt vmcnt(0)
368 ; GFX9-NEXT: s_setpc_b64 s[30:31]
369 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
370 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1>
371 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <6,1,0,6>: lanes 0 and 3 are
; undefined, lanes 1 and 2 take %vec0 elements 1 and 0. The checks expect one
; v_perm_b32 with selector 0x5040504 (materialized in s4 on gfx908).
375 define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
376 ; GFX10-LABEL: shuffle6106ud2:
378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
381 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
382 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
383 ; GFX10-NEXT: s_setpc_b64 s[30:31]
385 ; GFX9-LABEL: shuffle6106ud2:
387 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
389 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
391 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
392 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
393 ; GFX9-NEXT: s_waitcnt vmcnt(0)
394 ; GFX9-NEXT: s_setpc_b64 s[30:31]
395 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
396 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
397 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <4,3,2,7>: lanes 0 and 3 are
; undefined, lanes 1 and 2 take %vec0 elements 3 and 2. The checks expect one
; v_perm_b32 with selector 0x7060706 (materialized in s4 on gfx908).
402 define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
403 ; GFX10-LABEL: shuffle4327ud2:
405 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
407 ; GFX10-NEXT: s_waitcnt vmcnt(0)
408 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
409 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
410 ; GFX10-NEXT: s_setpc_b64 s[30:31]
412 ; GFX9-LABEL: shuffle4327ud2:
414 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
416 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
417 ; GFX9-NEXT: s_waitcnt vmcnt(0)
418 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
419 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
421 ; GFX9-NEXT: s_setpc_b64 s[30:31]
422 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7>
424 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <3,2,6,3>: lane 2 is undefined, the
; other lanes take %vec0 elements 3, 2 and 3. The checks expect one
; v_perm_b32 with selector 0x7060607 (materialized in s4 on gfx908).
428 define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
429 ; GFX10-LABEL: shuffle3263ud2:
431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
433 ; GFX10-NEXT: s_waitcnt vmcnt(0)
434 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607
435 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
436 ; GFX10-NEXT: s_setpc_b64 s[30:31]
438 ; GFX9-LABEL: shuffle3263ud2:
440 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
442 ; GFX9-NEXT: s_mov_b32 s4, 0x7060607
443 ; GFX9-NEXT: s_waitcnt vmcnt(0)
444 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
445 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
447 ; GFX9-NEXT: s_setpc_b64 s[30:31]
448 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
449 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3>
450 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <2,7,6,3>: lanes 1 and 2 are
; undefined, lanes 0 and 3 take %vec0 elements 2 and 3. The checks expect one
; v_perm_b32 with selector 0x7060706 (materialized in s4 on gfx908).
454 define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
455 ; GFX10-LABEL: shuffle2763ud2:
457 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
461 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
462 ; GFX10-NEXT: s_setpc_b64 s[30:31]
464 ; GFX9-LABEL: shuffle2763ud2:
466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
468 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
471 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
473 ; GFX9-NEXT: s_setpc_b64 s[30:31]
474 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
475 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3>
476 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <1,3,2,7>: lane 3 is undefined, the
; other lanes take %vec0 elements 1, 3 and 2. The checks expect one
; v_perm_b32 with selector 0x7060705 (materialized in s4 on gfx908).
480 define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
481 ; GFX10-LABEL: shuffle1327ud2:
483 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705
487 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
488 ; GFX10-NEXT: s_setpc_b64 s[30:31]
490 ; GFX9-LABEL: shuffle1327ud2:
492 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
494 ; GFX9-NEXT: s_mov_b32 s4, 0x7060705
495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
496 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
497 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
499 ; GFX9-NEXT: s_setpc_b64 s[30:31]
500 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
501 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7>
502 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Undef-second-operand shuffle with mask <0,6,0,5>: lanes 1 and 3 are
; undefined, lanes 0 and 2 both take %vec0 element 0. The checks expect one
; v_perm_b32 with selector 0x5040504 (materialized in s4 on gfx908).
506 define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
507 ; GFX10-LABEL: shuffle0605ud2:
509 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
511 ; GFX10-NEXT: s_waitcnt vmcnt(0)
512 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
513 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
514 ; GFX10-NEXT: s_setpc_b64 s[30:31]
516 ; GFX9-LABEL: shuffle0605ud2:
518 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
520 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
521 ; GFX9-NEXT: s_waitcnt vmcnt(0)
522 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
523 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
524 ; GFX9-NEXT: s_waitcnt vmcnt(0)
525 ; GFX9-NEXT: s_setpc_b64 s[30:31]
526 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
527 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
528 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
; Shuffle <1,3,5,4> of %vec0 with an undef second operand, followed by an
; insertelement of %elt into lane 1. %vec1 is loaded but never used. The
; checks expect no v_perm_b32 here: the result is built with a 16-bit shift,
; v_or_b32_sdwa and a 0xffff mask.
532 define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
533 ; GFX10-LABEL: insertUsesOr:
535 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
537 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
538 ; GFX10-NEXT: s_waitcnt vmcnt(0)
539 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
540 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
541 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
542 ; GFX10-NEXT: s_setpc_b64 s[30:31]
544 ; GFX9-LABEL: insertUsesOr:
546 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
548 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v4
549 ; GFX9-NEXT: s_waitcnt vmcnt(0)
550 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
551 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
552 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
554 ; GFX9-NEXT: s_setpc_b64 s[30:31]
555 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
556 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
557 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
558 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
559 store <4 x i8> %vecins, ptr addrspace(1) %out0
; Shuffle <7,0,6,3> of %vec0 with an undef second operand, then a <4 x i8> add
; with %vec1. The checks expect no v_perm_b32: the bytes are added
; individually (v_add_nc_u16 on gfx1010, v_add_u16_sdwa on gfx908) and the
; result is reassembled with v_or.
563 define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
564 ; GFX10-LABEL: addUsesOr:
566 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
568 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
569 ; GFX10-NEXT: s_waitcnt vmcnt(1)
570 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
571 ; GFX10-NEXT: s_waitcnt vmcnt(0)
572 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v7
573 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
574 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7
575 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
576 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
577 ; GFX10-NEXT: v_add_nc_u16 v2, v2, v3
578 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
579 ; GFX10-NEXT: v_add_nc_u16 v1, v4, v1
580 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
581 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
582 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
583 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
584 ; GFX10-NEXT: s_setpc_b64 s[30:31]
586 ; GFX9-LABEL: addUsesOr:
588 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
590 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
592 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
593 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
594 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
595 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
596 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
597 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
598 ; GFX9-NEXT: s_waitcnt vmcnt(0)
599 ; GFX9-NEXT: s_setpc_b64 s[30:31]
600 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
601 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
602 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3>
603 %added = add <4 x i8> %shuffle0_0, %vec1
604 store <4 x i8> %added, ptr addrspace(1) %out0
; Kernel-entry (amdgpu_kernel) test of a two-source <8 x i8> shuffle with mask
; <1,8,5,12,0,14,2,9>. The checks expect the pointers and both vectors to be
; fetched with s_load_* and the result to be assembled with scalar bit ops
; (s_bfe_u32, s_lshl_b32, s_and_b32, s_or_b32); no v_perm_b32 appears.
609 define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 {
610 ; GFX10-LABEL: shuffle8i8:
611 ; GFX10: ; %bb.0: ; %bb
612 ; GFX10-NEXT: s_clause 0x1
613 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
614 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
615 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
616 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
617 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
618 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
619 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
620 ; GFX10-NEXT: s_bfe_u32 s2, s5, 0x80008
621 ; GFX10-NEXT: s_lshl_b32 s1, s9, 8
622 ; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100010
623 ; GFX10-NEXT: s_bfe_u32 s0, s4, 0x80008
624 ; GFX10-NEXT: s_lshl_b32 s3, s8, 8
625 ; GFX10-NEXT: s_and_b32 s5, s8, 0xff00
626 ; GFX10-NEXT: s_bfe_u32 s8, s4, 0x80010
627 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff
628 ; GFX10-NEXT: s_or_b32 s1, s2, s1
629 ; GFX10-NEXT: s_lshl_b32 s2, s9, 8
630 ; GFX10-NEXT: s_or_b32 s0, s0, s3
631 ; GFX10-NEXT: s_or_b32 s3, s8, s5
632 ; GFX10-NEXT: s_or_b32 s2, s4, s2
633 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
634 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16
635 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
636 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16
637 ; GFX10-NEXT: s_or_b32 s0, s0, s1
638 ; GFX10-NEXT: s_or_b32 s1, s2, s3
639 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
640 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
641 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
642 ; GFX10-NEXT: s_endpgm
644 ; GFX9-LABEL: shuffle8i8:
645 ; GFX9: ; %bb.0: ; %bb
646 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
647 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
648 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
651 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
652 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX9-NEXT: s_bfe_u32 s0, s4, 0x80008
654 ; GFX9-NEXT: s_lshl_b32 s1, s9, 8
655 ; GFX9-NEXT: s_bfe_u32 s2, s5, 0x80008
656 ; GFX9-NEXT: s_lshl_b32 s3, s8, 8
657 ; GFX9-NEXT: s_or_b32 s1, s2, s1
658 ; GFX9-NEXT: s_or_b32 s0, s0, s3
659 ; GFX9-NEXT: s_bfe_u32 s2, s4, 0x80010
660 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff
661 ; GFX9-NEXT: s_bfe_u32 s4, s9, 0x100010
662 ; GFX9-NEXT: s_and_b32 s5, s8, 0xff00
663 ; GFX9-NEXT: s_lshl_b32 s4, s4, 8
664 ; GFX9-NEXT: s_or_b32 s2, s2, s5
665 ; GFX9-NEXT: s_or_b32 s3, s3, s4
666 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
667 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16
668 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
669 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16
670 ; GFX9-NEXT: s_or_b32 s2, s3, s2
671 ; GFX9-NEXT: s_or_b32 s0, s0, s1
672 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
673 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
674 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
675 ; GFX9-NEXT: s_endpgm
677 %vec0 = load <8 x i8>, ptr addrspace(1) %in0
678 %vec1 = load <8 x i8>, ptr addrspace(1) %in1
679 %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9>
680 store <8 x i8> %shuffle0, ptr addrspace(1) %out1
684 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
685 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
687 ; Not combined to perm due to non-vectorized use, non-divergent
; Two-source shuffle <1,3,5,4> whose only user is a <4 x i8> add with %vec1.
; The load addresses are the plain %in0 / %in1 arguments (no workitem id is
; involved). The checks contain per-byte adds and v_or_b32_sdwa merges and no
; v_perm_b32.
688 define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
691 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
693 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
694 ; GFX10-NEXT: s_waitcnt vmcnt(1)
695 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
696 ; GFX10-NEXT: s_waitcnt vmcnt(0)
697 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
698 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v7
699 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4
700 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v7
701 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
702 ; GFX10-NEXT: v_add_nc_u16 v2, v7, v2
703 ; GFX10-NEXT: v_add_nc_u16 v3, v3, v7
704 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v4
705 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
706 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
707 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
708 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
709 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
710 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
711 ; GFX10-NEXT: s_setpc_b64 s[30:31]
715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
717 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
718 ; GFX9-NEXT: s_waitcnt vmcnt(0)
719 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
720 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
721 ; GFX9-NEXT: v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
722 ; GFX9-NEXT: v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
723 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
724 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
725 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
726 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
728 ; GFX9-NEXT: s_setpc_b64 s[30:31]
729 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
730 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
731 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
732 %vecins = add <4 x i8> %shuffle0_0, %vec1
733 store <4 x i8> %vecins, ptr addrspace(1) %out0
737 ; Not combined to perm due to non-vectorized use
; Both loads are indexed by llvm.amdgcn.workitem.id.x (the checks mask v31
; with 0x3ff and scale it by 4 to form the byte offset). The shuffle <1,3,5,4>
; has an undef second operand, so only lanes 0 and 1 are defined; the checks
; expect two byte adds, a v_or_b32_sdwa merge and a 0xffff mask, with no
; v_perm_b32.
738 define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
739 ; GFX10-LABEL: add_div:
741 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
743 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
744 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
745 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
746 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
747 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
748 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
749 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
750 ; GFX10-NEXT: s_waitcnt vmcnt(1)
751 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
753 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7
754 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
755 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4
756 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
757 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v7
758 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
759 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
760 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
761 ; GFX10-NEXT: s_setpc_b64 s[30:31]
763 ; GFX9-LABEL: add_div:
765 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
767 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
768 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
769 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
770 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
771 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
772 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
773 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
774 ; GFX9-NEXT: s_waitcnt vmcnt(0)
775 ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
776 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
777 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
778 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
779 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
780 ; GFX9-NEXT: s_waitcnt vmcnt(0)
781 ; GFX9-NEXT: s_setpc_b64 s[30:31]
782 %tid = call i32 @llvm.amdgcn.workitem.id.x()
783 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
784 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
785 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
786 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
787 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
788 %vecins = add <4 x i8> %shuffle0_0, %vec1
789 store <4 x i8> %vecins, ptr addrspace(1) %out0
793 ; Not combined to perm due to non-divergent use
; Test @add_store: loads one <4 x i8> directly from %in0 and one from %in1
; (no workitem-id indexing), shuffles the first load against undef, stores
; shuffle + second load to %out0 and the raw shuffle to %out1. %elt is not used.
; The expected GFX10 and GFX9 output below contains no v_perm_b32.
794 define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
795 ; GFX10-LABEL: add_store:
797 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
799 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
800 ; GFX10-NEXT: s_waitcnt vmcnt(1)
801 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
802 ; GFX10-NEXT: s_waitcnt vmcnt(0)
803 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
804 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
805 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
806 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
807 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
808 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
809 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
810 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
811 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
812 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
813 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
814 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
815 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
816 ; GFX10-NEXT: s_setpc_b64 s[30:31]
818 ; GFX9-LABEL: add_store:
820 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
821 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
822 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
823 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
824 ; GFX9-NEXT: s_waitcnt vmcnt(1)
825 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
827 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
828 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
829 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
830 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
831 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
832 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
833 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
834 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
835 ; GFX9-NEXT: s_waitcnt vmcnt(0)
836 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Loads use the pointer arguments as-is; no per-workitem offset is applied.
837 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
838 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; Second shuffle operand is undef, so mask indices 5 and 4 select undef lanes.
839 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
840 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
841 store <4 x i8> %vecins, ptr addrspace(1) %out0
842 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
846 ; Not combined to perm due to 16 bit or
; Test @add_store_div_16: same data flow as @add_store but with both loads
; indexed by the workitem id. The shuffle still uses undef as its second
; operand; shuffle + second load goes to %out0 and the raw shuffle to %out1.
; %elt is not used.
; The expected GFX10 and GFX9 output below contains no v_perm_b32.
847 define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
848 ; GFX10-LABEL: add_store_div_16:
850 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
852 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
853 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
854 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
855 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
856 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
857 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
858 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
859 ; GFX10-NEXT: s_waitcnt vmcnt(1)
860 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
861 ; GFX10-NEXT: s_waitcnt vmcnt(0)
862 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
863 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
864 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
865 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
866 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9
867 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
868 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
869 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
870 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
871 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
872 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
873 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
874 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
875 ; GFX10-NEXT: s_setpc_b64 s[30:31]
877 ; GFX9-LABEL: add_store_div_16:
879 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
880 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
881 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
882 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
883 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
884 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
885 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
886 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
887 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
888 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
889 ; GFX9-NEXT: s_waitcnt vmcnt(1)
890 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
891 ; GFX9-NEXT: s_waitcnt vmcnt(0)
892 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
893 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
894 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
895 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
896 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
897 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
898 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
899 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
900 ; GFX9-NEXT: s_waitcnt vmcnt(0)
901 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
902 %tid = call i32 @llvm.amdgcn.workitem.id.x()
903 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
904 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
905 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
906 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Second shuffle operand is undef, so only lanes 0 and 1 of the result
; (elements 1 and 3 of %vec0) are defined; the expected output masks the
; stored dwords with 0xffff.
907 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
908 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
909 store <4 x i8> %vecins, ptr addrspace(1) %out0
910 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
914 ; Vectorized use, divergent, 32 bit or
; Test @add_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles the two loads together (both operands
; defined), stores shuffle + second load to %out0 and the raw shuffle to %out1.
; %elt is not used.
; The expected output materializes the stored shuffle with a single v_perm_b32
; (selector 0x10705) on both GFX10 and GFX9.
915 define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
916 ; GFX10-LABEL: add_store_div:
918 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
920 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
921 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
922 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
923 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
924 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
925 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
926 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
927 ; GFX10-NEXT: s_waitcnt vmcnt(1)
928 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
929 ; GFX10-NEXT: s_waitcnt vmcnt(0)
930 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
931 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v9
932 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4
933 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v9
934 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
935 ; GFX10-NEXT: v_add_nc_u16 v2, v9, v2
936 ; GFX10-NEXT: v_add_nc_u16 v3, v3, v9
937 ; GFX10-NEXT: v_add_nc_u16 v1, v1, v10
938 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
939 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
940 ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
941 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
942 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
943 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705
944 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
945 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
946 ; GFX10-NEXT: s_setpc_b64 s[30:31]
948 ; GFX9-LABEL: add_store_div:
950 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
952 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
953 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
954 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
955 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
956 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
957 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
958 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
959 ; GFX9-NEXT: s_mov_b32 s4, 0x10705
960 ; GFX9-NEXT: s_waitcnt vmcnt(0)
961 ; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4
962 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
963 ; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
964 ; GFX9-NEXT: v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
965 ; GFX9-NEXT: v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
966 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
967 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
968 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
969 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
970 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
971 ; GFX9-NEXT: s_waitcnt vmcnt(0)
972 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
973 %tid = call i32 @llvm.amdgcn.workitem.id.x()
974 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
975 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
976 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
977 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[1], %vec0[3], %vec1[1], %vec1[0] (mask indices >= 4
; address the second operand).
978 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
979 %vecins = add <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the add above and the direct store to %out1.
980 store <4 x i8> %vecins, ptr addrspace(1) %out0
981 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test @and_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, stores the shuffle ANDed
; with the constant <1, 2, 2, 1> to %out0 and the raw shuffle to %out1.
; %elt is not used.
; The expected output materializes the stored shuffle with a single v_perm_b32
; (selector 0x5070006) on both GFX10 and GFX9.
985 define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
986 ; GFX10-LABEL: and_store_div:
988 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
990 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
991 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
992 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
993 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
994 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
995 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
996 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
997 ; GFX10-NEXT: v_mov_b32_e32 v0, 2
998 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
999 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1000 ; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1001 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1002 ; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1003 ; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9
1004 ; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1005 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
1006 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1007 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1008 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006
1009 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1010 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1011 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1013 ; GFX9-LABEL: and_store_div:
1015 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1017 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1018 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1019 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1020 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1021 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1022 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1023 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1024 ; GFX9-NEXT: s_mov_b32 s4, 0x5070006
1025 ; GFX9-NEXT: v_mov_b32_e32 v0, 2
1026 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
1027 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1028 ; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1029 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1030 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
1031 ; GFX9-NEXT: v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1032 ; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4
1033 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1034 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1035 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1036 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1037 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1038 ; GFX9-NEXT: global_store_dword v[7:8], v2, off
1039 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1040 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1041 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1042 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1043 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1044 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1045 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[2], %vec1[0], %vec0[3], %vec0[1].
1046 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1>
; Per-lane constant mask; the shuffle is also stored unmodified below.
1047 %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1048 store <4 x i8> %vecins, ptr addrspace(1) %out0
1049 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test @ashr_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, stores the shuffle
; arithmetically shifted right by <1, 2, 2, 1> to %out0 and the raw shuffle to
; %out1. %elt is not used.
; The expected output materializes the stored shuffle with a single v_perm_b32
; (selector 0x4010707) on both GFX10 and GFX9.
1053 define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1054 ; GFX10-LABEL: ashr_store_div:
1056 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1058 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1059 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1060 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1061 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1062 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4
1063 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1064 ; GFX10-NEXT: v_mov_b32_e32 v2, 26
1065 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1066 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1067 ; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8
1068 ; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1069 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9
1070 ; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1
1071 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1072 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1073 ; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0
1074 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707
1075 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1076 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1077 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1078 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
1079 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
1080 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1082 ; GFX9-LABEL: ashr_store_div:
1084 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1085 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1086 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1087 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1088 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1089 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1090 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1091 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1092 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1093 ; GFX9-NEXT: v_mov_b32_e32 v1, 7
1094 ; GFX9-NEXT: s_mov_b32 s4, 0x4010707
1095 ; GFX9-NEXT: v_mov_b32_e32 v0, 26
1096 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1097 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1098 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1099 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
1100 ; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1101 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 25, v4
1102 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9
1103 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1104 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1105 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1106 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1107 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1108 ; GFX9-NEXT: global_store_dword v[7:8], v2, off
1109 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1110 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1111 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1112 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1113 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1114 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1115 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec0[3], %vec0[3], %vec1[1], %vec0[0] (element 3 of %vec0 is
; selected twice).
1116 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0>
; Per-lane arithmetic shift; the shuffle is also stored unmodified below.
1117 %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1118 store <4 x i8> %vecins, ptr addrspace(1) %out0
1119 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test @bc_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, then stores the shuffle
; twice: bitcast to i32 into %out1 and as a <4 x i8> into %out0.
; %elt is not used.
; The expected output computes one v_perm_b32 (selector 0x7060104) and feeds
; the same result register to both stores on GFX10 and GFX9.
1123 define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1124 ; GFX10-LABEL: bc_store_div:
1126 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1128 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1129 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1130 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1131 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1132 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1133 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1134 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1135 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x7060104
1137 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
1138 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1139 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1141 ; GFX9-LABEL: bc_store_div:
1143 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1145 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1146 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1147 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1148 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1149 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1150 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1151 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1152 ; GFX9-NEXT: s_mov_b32 s4, 0x7060104
1153 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1154 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1155 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1156 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1157 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1158 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1159 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1160 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1161 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1162 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1163 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[0], %vec0[1], %vec1[2], %vec1[3].
1164 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
; Second user of the shuffle is a plain reinterpretation of the same 32 bits.
1165 %insvec = bitcast <4 x i8> %shuffle0_0 to i32
1166 store i32 %insvec, ptr addrspace(1) %out1
1167 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; Test @eve_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, stores element 1 of the
; shuffle as a single i8 to %out2 and the whole shuffle to %out1.
; %elt and %out0 are not used.
; The expected output uses a single v_perm_b32 (selector 0x1020305) for the
; vector store and a 24-bit right shift of the first load for the byte store.
1172 define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
1173 ; GFX10-LABEL: eve_store_div:
1175 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1177 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1178 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1179 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1180 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1181 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1182 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1183 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1184 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1185 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
1186 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1187 ; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x1020305
1188 ; GFX10-NEXT: global_store_byte v[9:10], v0, off
1189 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1190 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1192 ; GFX9-LABEL: eve_store_div:
1194 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1195 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1196 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1197 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1198 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1199 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1200 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1201 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1202 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1203 ; GFX9-NEXT: s_mov_b32 s4, 0x1020305
1204 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1205 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4
1206 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1208 ; GFX9-NEXT: global_store_byte v[9:10], v1, off
1209 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1210 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1211 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1212 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1213 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1214 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1215 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1216 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[1], %vec0[3], %vec0[2], %vec0[1].
1217 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1>
; Lane 1 of the shuffle is %vec0[3], i.e. the top byte of the first load.
1218 %tmp = extractelement <4 x i8> %shuffle0_0, i32 1
1219 store i8 %tmp, ptr addrspace(1) %out2
1220 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1224 ; Not combined to perm due to multi use of or operands (introduced by insert op)
; Test @ive_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, stores the shuffle with
; lane 1 replaced by %elt to %out0 and the raw shuffle to %out1.
; In the expected output only the raw-shuffle store is produced by v_perm_b32
; (selector 0x2000706); the inserted vector is assembled from shift/and/or
; instructions on both GFX10 and GFX9.
1225 define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1226 ; GFX10-LABEL: ive_store_div:
1228 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229 ; GFX10-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1230 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 2, v9
1231 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v9
1232 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1233 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9
1234 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1235 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1236 ; GFX10-NEXT: global_load_dword v10, v[2:3], off
1237 ; GFX10-NEXT: v_mov_b32_e32 v0, 16
1238 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
1239 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4
1240 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1241 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1242 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1243 ; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1244 ; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1245 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
1246 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1247 ; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
1248 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1249 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1250 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1252 ; GFX9-LABEL: ive_store_div:
1254 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255 ; GFX9-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1256 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 2, v9
1257 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v9
1258 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1259 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9
1260 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1261 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1262 ; GFX9-NEXT: global_load_dword v10, v[2:3], off
1263 ; GFX9-NEXT: s_movk_i32 s4, 0xff
1264 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
1265 ; GFX9-NEXT: s_mov_b32 s5, 0x2000706
1266 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1267 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1269 ; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1270 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1271 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
1272 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1273 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1274 ; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5
1275 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1276 ; GFX9-NEXT: global_store_dword v[7:8], v3, off
1277 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1278 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1279 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1280 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1281 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1282 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1283 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[2], %vec1[3], %vec0[0], %vec0[2].
1284 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2>
; Overwrites lane 1 with the scalar argument; this is the only use of %elt.
1285 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
1286 store <4 x i8> %vecins, ptr addrspace(1) %out0
1287 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; Test @lhsr_store_div: loads one <4 x i8> from %in0 and one from %in1, both
; indexed by the workitem id, shuffles them together, stores the shuffle
; logically shifted right by <1, 2, 2, 1> to %out0 and the raw shuffle to
; %out1. %elt is not used.
; The expected output materializes the stored shuffle with a single v_perm_b32
; (selector 0x1030707) on both GFX10 and GFX9.
; NOTE(review): the function name spells "lhsr" while the IR operation is lshr;
; the name is also the CHECK label, so it is left unchanged here.
1292 define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1293 ; GFX10-LABEL: lhsr_store_div:
1295 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1297 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1298 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1299 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1300 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1301 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1302 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1303 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1304 ; GFX10-NEXT: v_mov_b32_e32 v0, 26
1305 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1306 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4
1307 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1308 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1309 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9
1310 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4
1311 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1
1312 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
1313 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1314 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1315 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707
1316 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1317 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1318 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1320 ; GFX9-LABEL: lhsr_store_div:
1322 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1323 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1324 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1325 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1326 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1327 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1328 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1329 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1330 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1331 ; GFX9-NEXT: v_mov_b32_e32 v0, 26
1332 ; GFX9-NEXT: s_mov_b32 s4, 0x1030707
1333 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1334 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, 1, v4
1335 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1336 ; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1337 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 25, v9
1338 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
1339 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 26, v4
1340 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
1341 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3
1342 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1343 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1344 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1345 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
1346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1347 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Both input pointers are offset by the workitem id before loading.
1348 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1349 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1350 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1351 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1352 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Result lanes: %vec1[3], %vec1[3], %vec0[3], %vec0[1] (element 3 of %vec1 is
; selected twice).
1353 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
; Per-lane logical shift; the shuffle is also stored unmodified below.
1354 %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1355 store <4 x i8> %vecins, ptr addrspace(1) %out0
1356 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; mul_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and shuffles them with mask <0, 1, 4, 6>,
; i.e. vec0[0], vec0[1], vec1[0], vec1[2]. The shuffle result has two users:
; it is multiplied element-wise by %vec1 and stored to %out0, and it is also
; stored unmodified to %out1. The assertions below expect the unmodified copy
; to be formed by a single v_perm_b32 with selector 0x2000504 on both targets,
; while the product is computed with v_mul_lo_u16 and repacked through
; v_or_b32_sdwa. %elt is not referenced in the body.
1361 define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1362 ; GFX10-LABEL: mul_store_div:
1364 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1366 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1367 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1368 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1369 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1370 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1371 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1372 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1373 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1374 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v4
1375 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1376 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v9
1377 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v9
1378 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9
1379 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2
1380 ; GFX10-NEXT: v_mul_lo_u16 v1, v3, v1
1381 ; GFX10-NEXT: v_mul_lo_u16 v2, v4, v9
1382 ; GFX10-NEXT: v_mul_lo_u16 v3, v9, v3
1383 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
1384 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
1385 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1386 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1387 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1388 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2000504
1389 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1390 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1391 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1393 ; GFX9-LABEL: mul_store_div:
1395 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1396 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1397 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1398 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1399 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1400 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1401 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1402 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1403 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1404 ; GFX9-NEXT: s_mov_b32 s4, 0x2000504
1405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1406 ; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4
1407 ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v4, v9
1408 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1409 ; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
1410 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1411 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1412 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1413 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1414 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
1415 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1416 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1417 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1418 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1419 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1420 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1421 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1422 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6>
1424 %vecins = mul <4 x i8> %shuffle0_0, %vec1
1425 store <4 x i8> %vecins, ptr addrspace(1) %out0
1426 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; or_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and shuffles them with mask <1, 4, 5, 6>,
; i.e. vec0[1], vec1[0], vec1[1], vec1[2]. The shuffle result has two users:
; it is or'ed with the constant vector <1, 2, 2, 1> and stored to %out0, and
; it is also stored unmodified to %out1. The assertions below expect the
; unmodified copy to be formed by a single v_perm_b32 with selector 0x2010005
; on both targets. %elt is not referenced in the body.
1431 define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1432 ; GFX10-LABEL: or_store_div:
1434 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1435 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1436 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1437 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1438 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1439 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1440 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1441 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1442 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1443 ; GFX10-NEXT: v_mov_b32_e32 v0, 16
1444 ; GFX10-NEXT: v_bfrev_b32_e32 v2, 4.0
1445 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1446 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
1447 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1448 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1449 ; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1450 ; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1451 ; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1
1452 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1453 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1454 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005
1455 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1456 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1457 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1459 ; GFX9-LABEL: or_store_div:
1461 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1463 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1464 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1465 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1466 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
1467 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1468 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1469 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1470 ; GFX9-NEXT: s_mov_b32 s4, 0x2010005
1471 ; GFX9-NEXT: s_movk_i32 s5, 0x102
1472 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1473 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1474 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2
1475 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1476 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1477 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1478 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1479 ; GFX9-NEXT: v_perm_b32 v4, v0, v2, s4
1480 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1481 ; GFX9-NEXT: v_or_b32_e32 v0, 0x201, v0
1482 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1483 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
1484 ; GFX9-NEXT: global_store_dword v[7:8], v4, off
1485 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1486 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1487 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1488 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1489 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1490 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1491 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1492 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
1493 %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1494 store <4 x i8> %vecins, ptr addrspace(1) %out0
1495 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sdiv_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and shuffles them with mask <2, 3, 2, 4>,
; i.e. vec0[2], vec0[3], vec0[2], vec1[0]. The shuffle result has two users:
; it is the dividend of an element-wise sdiv by %vec1 whose quotient is stored
; to %out0, and it is also stored unmodified to %out1. The assertions below
; expect the unmodified copy to be formed by a single v_perm_b32 with selector
; 0x60706 on both targets; the division itself appears as a per-element
; float sequence (v_cvt_f32_i32_sdwa, v_rcp_iflag_f32, v_mul_f32, v_trunc_f32,
; v_mad_f32, compare + v_cndmask_b32). %elt is not referenced in the body.
1499 define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1500 ; GFX10-LABEL: sdiv_store_div:
1502 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1504 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1505 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1506 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1507 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1508 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1509 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1510 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1511 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1512 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1513 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1514 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1515 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1516 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1517 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1518 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1
1519 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10
1520 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1521 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12
1522 ; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1523 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14
1524 ; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1525 ; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1526 ; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
1527 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1528 ; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15
1529 ; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16
1530 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3
1531 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
1532 ; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
1533 ; GFX10-NEXT: v_trunc_f32_e32 v15, v15
1534 ; GFX10-NEXT: v_trunc_f32_e32 v16, v16
1535 ; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18
1536 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17
1537 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
1538 ; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2
1539 ; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19
1540 ; GFX10-NEXT: v_or_b32_e32 v3, 1, v3
1541 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18
1542 ; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2
1543 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
1544 ; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13
1545 ; GFX10-NEXT: v_or_b32_e32 v11, 1, v11
1546 ; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1
1547 ; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15
1548 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
1549 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
1550 ; GFX10-NEXT: v_or_b32_e32 v13, 1, v13
1551 ; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16
1552 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
1553 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
1554 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo
1555 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
1556 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0
1557 ; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1558 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1559 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
1560 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1561 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2
1562 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo
1563 ; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1564 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1565 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1566 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
1567 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1568 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1569 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1571 ; GFX9-LABEL: sdiv_store_div:
1573 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1575 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1576 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1577 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1578 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1579 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1580 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
1581 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1582 ; GFX9-NEXT: s_mov_b32 s4, 0x60706
1583 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1584 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1585 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1586 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1587 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1588 ; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1589 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1590 ; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1591 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1592 ; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1593 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1594 ; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
1595 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1596 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2
1597 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12
1598 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13
1599 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4
1600 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15
1601 ; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16
1602 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15
1603 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1604 ; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17
1605 ; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18
1606 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16
1607 ; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3
1608 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10
1609 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
1610 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17
1611 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18
1612 ; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11
1613 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2|
1614 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9
1615 ; GFX9-NEXT: v_or_b32_e32 v10, 1, v10
1616 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15
1617 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16
1618 ; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3
1619 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17
1620 ; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2
1621 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
1622 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1623 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12|
1624 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14
1625 ; GFX9-NEXT: v_or_b32_e32 v9, 1, v9
1626 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
1627 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13|
1628 ; GFX9-NEXT: v_or_b32_e32 v14, 1, v14
1629 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
1630 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4|
1631 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc
1632 ; GFX9-NEXT: v_add_u32_e32 v1, v15, v1
1633 ; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1634 ; GFX9-NEXT: v_add_u32_e32 v3, v17, v3
1635 ; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1636 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1637 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1638 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1639 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
1640 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1642 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1643 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1644 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1645 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1646 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1647 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1648 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4>
1649 %vecins = sdiv <4 x i8> %shuffle0_0, %vec1
1650 store <4 x i8> %vecins, ptr addrspace(1) %out0
1651 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sext_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and shuffles them with mask <3, 3, 5, 7>,
; i.e. vec0[3], vec0[3], vec1[1], vec1[3]. The shuffle result has two users:
; it is sign-extended to <4 x i16> and stored to %out1, and it is also stored
; unmodified as <4 x i8> to %out0. The assertions below expect the unmodified
; copy to be formed by a single v_perm_b32 with selector 0x3010707 on both
; targets; the sign-extended halves come from v_ashrrev_i16 by 8, are packed
; with v_perm_b32 selector 0x5040100 and written by one dwordx2 store.
; %elt is not referenced in the body.
1656 define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1657 ; GFX10-LABEL: sext_store_div:
1659 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1660 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1661 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1662 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1663 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1664 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1665 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1666 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1667 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1668 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1669 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1670 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1671 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1672 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, v4
1673 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
1674 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, v1
1675 ; GFX10-NEXT: v_perm_b32 v1, v0, v2, 0x5040100
1676 ; GFX10-NEXT: v_perm_b32 v0, v3, v3, 0x5040100
1677 ; GFX10-NEXT: v_perm_b32 v2, v9, v4, 0x3010707
1678 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1679 ; GFX10-NEXT: global_store_dword v[5:6], v2, off
1680 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1682 ; GFX9-LABEL: sext_store_div:
1684 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1685 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1686 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1687 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1688 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1689 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1690 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1691 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1692 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1693 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
1694 ; GFX9-NEXT: s_mov_b32 s5, 0x5040100
1695 ; GFX9-NEXT: s_mov_b32 s4, 0x3010707
1696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1697 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9
1698 ; GFX9-NEXT: v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1699 ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1700 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s5
1701 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s5
1702 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
1703 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1704 ; GFX9-NEXT: global_store_dword v[5:6], v2, off
1705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1707 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1708 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1709 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1710 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1711 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1712 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
1713 %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16>
1714 store <4 x i16> %insvec, ptr addrspace(1) %out1
1715 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; shl_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and shuffles them with mask <4, 1, 0, 5>,
; i.e. vec1[0], vec0[1], vec0[0], vec1[1]. The shuffle result has two users:
; it is shifted left element-wise by the constant vector <1, 2, 2, 1> and
; stored to %out0, and it is also stored unmodified to %out1. The assertions
; below expect the unmodified copy to be formed by a single v_perm_b32 with
; selector 0x5000104 on both targets; the shifted bytes are produced with
; v_lshlrev_b16, masked with v_and_b32 and recombined with v_or_b32.
; %elt is not referenced in the body.
1720 define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1721 ; GFX10-LABEL: shl_store_div:
1723 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1725 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1726 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1727 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1728 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1729 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1730 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1731 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
1732 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1733 ; GFX10-NEXT: v_lshlrev_b16 v0, 2, v4
1734 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1735 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v9
1736 ; GFX10-NEXT: v_and_b32_e32 v2, 0xfffffc00, v0
1737 ; GFX10-NEXT: v_and_b32_e32 v3, 0xfe, v1
1738 ; GFX10-NEXT: v_and_b32_e32 v1, 0xfffffe00, v1
1739 ; GFX10-NEXT: v_and_b32_e32 v0, 0xfc, v0
1740 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
1741 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1742 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5000104
1743 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1744 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1745 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1746 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1748 ; GFX9-LABEL: shl_store_div:
1750 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1752 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1753 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1754 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1755 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1756 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1757 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1758 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
1759 ; GFX9-NEXT: s_mov_b32 s4, 0x5000104
1760 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1761 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 2, v4
1762 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1763 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 1, v9
1764 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
1765 ; GFX9-NEXT: v_and_b32_e32 v3, 0xfffffc00, v1
1766 ; GFX9-NEXT: v_and_b32_e32 v4, 0xfe, v2
1767 ; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffe00, v2
1768 ; GFX9-NEXT: v_and_b32_e32 v1, 0xfc, v1
1769 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v3
1770 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1771 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1772 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
1773 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
1774 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1775 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1776 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1777 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1778 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1779 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1780 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1781 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5>
1782 %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1783 store <4 x i8> %vecins, ptr addrspace(1) %out0
1784 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sitofp_store_div: each workitem loads one <4 x i8> from %in0 and one from
; %in1 (both indexed by workitem.id.x) and shuffles them with mask
; <5, 2, 1, 6>, i.e. vec1[1], vec0[2], vec0[1], vec1[2]. The shuffle result has
; two users: it is converted with sitofp to <4 x float> and stored to %out1,
; and it is also stored unmodified as <4 x i8> to %out0. The assertions below
; expect the unmodified copy to be formed by a single v_perm_b32 with selector
; 0x6010205 on both targets; the float lanes are produced by sign-extracting
; each byte (v_ashrrev_i16 / v_bfe_i32) followed by v_cvt_f32_i32_sdwa and are
; written by one dwordx4 store. %elt is not referenced in the body.
1789 define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1790 ; GFX10-LABEL: sitofp_store_div:
1792 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1793 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1794 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1795 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1796 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1797 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1798 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1799 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1800 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1801 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1802 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1803 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1804 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1805 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, v9
1806 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, v4
1807 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x6010205
1808 ; GFX10-NEXT: v_bfe_i32 v10, v0, 0, 8
1809 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
1810 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1811 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1812 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1813 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1814 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1815 ; GFX10-NEXT: global_store_dword v[5:6], v4, off
1816 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1818 ; GFX9-LABEL: sitofp_store_div:
1820 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1821 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1822 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1823 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1824 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1825 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1826 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
1827 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
1828 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1829 ; GFX9-NEXT: s_mov_b32 s4, 0x6010205
1830 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1831 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
1832 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9
1833 ; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 8
1834 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1835 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
1836 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 8, v4
1837 ; GFX9-NEXT: v_bfe_i32 v11, v2, 0, 8
1838 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1839 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1840 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1841 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1842 ; GFX9-NEXT: v_perm_b32 v4, v4, v9, s4
1843 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1844 ; GFX9-NEXT: global_store_dword v[5:6], v4, off
1845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1846 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1847 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1848 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1849 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1850 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1851 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
1852 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6>
1853 %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float>
1854 store <4 x float> %insvec, ptr addrspace(1) %out1
1855 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1860 define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1861 ; GFX10-LABEL: srem_store_div:
1863 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1865 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1866 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
1867 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1868 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
1869 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1870 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
1871 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1872 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1873 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1874 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1875 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1876 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1877 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1878 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1879 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2
1880 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13
1881 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3
1882 ; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1883 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15
1884 ; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1885 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1886 ; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
1887 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1888 ; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
1889 ; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17
1890 ; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18
1891 ; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19
1892 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
1893 ; GFX10-NEXT: v_or_b32_e32 v1, 1, v1
1894 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17
1895 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18
1896 ; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20
1897 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19
1898 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14
1899 ; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3
1900 ; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12
1901 ; GFX10-NEXT: v_or_b32_e32 v11, 1, v11
1902 ; GFX10-NEXT: v_trunc_f32_e32 v20, v20
1903 ; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15
1904 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2|
1905 ; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16
1906 ; GFX10-NEXT: v_or_b32_e32 v14, 1, v14
1907 ; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21
1908 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
1909 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
1910 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13|
1911 ; GFX10-NEXT: v_or_b32_e32 v16, 1, v16
1912 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
1913 ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
1914 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20
1915 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1916 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3|
1917 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1918 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4
1919 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4
1920 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1
1921 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo
1922 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15|
1923 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2
1924 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4
1925 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3
1926 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo
1927 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10
1928 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
1929 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11
1930 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
1931 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1932 ; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12
1933 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3
1934 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1935 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1936 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1937 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1938 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306
1939 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
1940 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
1941 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1943 ; GFX9-LABEL: srem_store_div:
1945 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1946 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1947 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
1948 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
1949 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1950 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1951 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1952 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
1953 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1954 ; GFX9-NEXT: s_mov_b32 s4, 0x2070306
1955 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1956 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1957 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1958 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1959 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1960 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1961 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14
1962 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1963 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10
1964 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
1965 ; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18
1966 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18
1967 ; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13
1968 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14|
1969 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3
1970 ; GFX9-NEXT: v_mul_f32_e32 v14, v16, v19
1971 ; GFX9-NEXT: v_trunc_f32_e32 v14, v14
1972 ; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16
1973 ; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13
1974 ; GFX9-NEXT: v_trunc_f32_e32 v13, v13
1975 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10|
1976 ; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10
1977 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1978 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3|
1979 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16
1980 ; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1981 ; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1982 ; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
1983 ; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3
1984 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
1985 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12
1986 ; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
1987 ; GFX9-NEXT: v_cvt_i32_f32_e32 v13, v13
1988 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
1989 ; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14
1990 ; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19
1991 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
1992 ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15
1993 ; GFX9-NEXT: v_or_b32_e32 v12, 1, v12
1994 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2
1995 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10
1996 ; GFX9-NEXT: v_or_b32_e32 v15, 1, v15
1997 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
1998 ; GFX9-NEXT: v_or_b32_e32 v10, 1, v10
1999 ; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
2000 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16|
2001 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
2002 ; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5]
2003 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
2004 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
2005 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4
2006 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4
2007 ; GFX9-NEXT: v_add_u32_e32 v2, v13, v2
2008 ; GFX9-NEXT: v_add_u32_e32 v12, v18, v12
2009 ; GFX9-NEXT: v_add_u32_e32 v13, v14, v15
2010 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v10
2011 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4
2012 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11
2013 ; GFX9-NEXT: v_mul_lo_u32 v10, v13, v0
2014 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17
2015 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
2016 ; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2017 ; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10
2018 ; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2019 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2020 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2021 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2022 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2023 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
2024 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2025 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2026 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2027 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2028 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2029 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2030 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2031 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2>
2032 %vecins = srem <4 x i8> %shuffle0_0, %vec1
2033 store <4 x i8> %vecins, ptr addrspace(1) %out0
2034 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sub_store_div: per-work-item test; both input addresses are offset by
; workitem.id.x. Loads one <4 x i8> from %in0 and one from %in1, builds a
; two-source byte shuffle, subtracts %vec1 from it, then stores the difference
; to %out0 and the shuffle itself to %out1. The assertions expect the stored
; shuffle to be a single v_perm_b32 (selector 0x6070007) on both gfx1010 and
; gfx908, with the subtraction done by 16-bit sub instructions.
; %elt is not referenced by the body.
2039 define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2040 ; GFX10-LABEL: sub_store_div:
2042 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2043 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2045 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2046 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2047 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2048 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2049 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2050 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2051 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2052 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
2053 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2
2054 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v2
2055 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2056 ; GFX10-NEXT: v_sub_nc_u16 v3, v0, v3
2057 ; GFX10-NEXT: v_sub_nc_u16 v9, v1, v4
2058 ; GFX10-NEXT: v_sub_nc_u16 v10, v4, v2
2059 ; GFX10-NEXT: v_sub_nc_u16 v1, v4, v1
2060 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007
2061 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
2062 ; GFX10-NEXT: v_lshlrev_b16 v4, 8, v9
2063 ; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2064 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2065 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2066 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2067 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2068 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2070 ; GFX9-LABEL: sub_store_div:
2072 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2074 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2075 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2076 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2077 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2078 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2079 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2080 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2081 ; GFX9-NEXT: s_mov_b32 s4, 0x6070007
2082 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2083 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
2084 ; GFX9-NEXT: v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
2085 ; GFX9-NEXT: v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2086 ; GFX9-NEXT: v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
2087 ; GFX9-NEXT: v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1
2088 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2089 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2090 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2091 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
2092 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
2093 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2094 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2095 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2096 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2097 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2098 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2099 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1, so the result is
; <vec1[3], vec0[0], vec1[3], vec1[2]>.
2100 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6>
2101 %vecins = sub <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the sub above and the direct store to %out1.
2102 store <4 x i8> %vecins, ptr addrspace(1) %out0
2103 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; sv_store_div: per-work-item test in which the result of one two-source byte
; shuffle is used as an operand of a second shufflevector. Both input
; addresses are offset by workitem.id.x.
; %elt, %out0 and %out2 are not referenced by the body.
; NOTE(review): both stores at the end of the body write to %out1, so the
; first one (%insvec) is overwritten by the second. The assertions accordingly
; contain only one v_perm_b32 (selector 0x50705, which matches %shuffle0_0)
; and a single global_store_dword. If the second-level shuffle was meant to be
; covered, one of the stores presumably should target %out0 or %out2 and the
; assertions be regenerated - confirm the intent before changing.
2108 define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
2109 ; GFX10-LABEL: sv_store_div:
2111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2113 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2114 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2115 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2116 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2117 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2118 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2119 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
2120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2121 ; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x50705
2122 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2123 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2125 ; GFX9-LABEL: sv_store_div:
2127 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2129 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2130 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2131 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2132 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2133 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2134 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2135 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
2136 ; GFX9-NEXT: s_mov_b32 s4, 0x50705
2137 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2138 ; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
2139 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
2140 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2141 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2142 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2143 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2144 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2145 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2146 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; First-level shuffle: <vec0[1], vec0[3], vec0[1], vec1[0]>.
2147 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4>
; Second-level shuffle over (%shuffle0_0, %vec1):
; <vec1[2], shuffle0_0[3], vec1[3], shuffle0_0[0]>.
2148 %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0>
; Both stores use the same destination pointer (see the review note above).
2149 store <4 x i8> %insvec, ptr addrspace(1) %out1
2150 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; trunc_store_div: per-work-item test; both input addresses are offset by
; workitem.id.x. Builds a two-source byte shuffle, truncates it to <4 x i1>
; and stores that to %out1, then stores the shuffle itself to %out0. The
; assertions expect the <4 x i1> value to be assembled bit by bit, masked
; with 15 and written with global_store_byte, while the stored shuffle is a
; single v_perm_b32 (selector 0x50205).
; %elt is not referenced by the body.
2155 define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2156 ; GFX10-LABEL: trunc_store_div:
2158 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2159 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2160 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2161 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2162 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2163 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2164 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2165 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2166 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2167 ; GFX10-NEXT: v_mov_b32_e32 v0, 1
2168 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2169 ; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2170 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2172 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
2173 ; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0
2174 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
2175 ; GFX10-NEXT: v_lshlrev_b16 v1, 3, v4
2176 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
2177 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
2178 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205
2179 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
2180 ; GFX10-NEXT: global_store_byte v[7:8], v0, off
2181 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2182 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2184 ; GFX9-LABEL: trunc_store_div:
2186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2188 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2189 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2190 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2191 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2192 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2193 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2194 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2195 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
2196 ; GFX9-NEXT: s_mov_b32 s4, 0x50205
2197 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2198 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 3, v4
2199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2200 ; GFX9-NEXT: v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2201 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2202 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
2203 ; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4
2204 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 2, v2
2205 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
2206 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
2207 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v3
2208 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
2209 ; GFX9-NEXT: global_store_byte v[7:8], v0, off
2210 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
2211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2212 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2213 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2214 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2215 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2216 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2217 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1, so the result is
; <vec1[1], vec0[2], vec1[1], vec0[0]>.
2218 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0>
2219 %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1>
; Note the destinations: the truncated value goes to %out1 and the shuffle
; to %out0.
2220 store <4 x i1> %insvec, ptr addrspace(1) %out1
2221 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; udiv: per-work-item test; both input addresses are offset by
; workitem.id.x. Builds a two-source byte shuffle, divides it element-wise
; (unsigned) by %vec1, stores the quotient to %out0 and the shuffle itself to
; %out1. The assertions expect the i8 division to be expanded through float
; conversion (v_cvt_f32_ubyteN, v_rcp_iflag_f32, v_trunc_f32, v_mad_f32) and
; the stored shuffle to be a single v_perm_b32 (selector 0x40207).
; %elt is not referenced by the body.
2225 define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2226 ; GFX10-LABEL: udiv:
2228 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2229 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2230 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2231 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2232 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2233 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2234 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2235 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2236 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2237 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2238 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
2239 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
2240 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2
2241 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2242 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0
2243 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
2244 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1
2245 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3
2246 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9
2247 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0
2248 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
2249 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207
2250 ; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10
2251 ; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11
2252 ; GFX10-NEXT: v_mul_f32_e32 v13, v1, v13
2253 ; GFX10-NEXT: v_mul_f32_e32 v12, v15, v12
2254 ; GFX10-NEXT: v_trunc_f32_e32 v10, v10
2255 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11
2256 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13
2257 ; GFX10-NEXT: v_trunc_f32_e32 v12, v12
2258 ; GFX10-NEXT: v_mad_f32 v14, -v10, v1, v14
2259 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
2260 ; GFX10-NEXT: v_mad_f32 v16, -v11, v3, v4
2261 ; GFX10-NEXT: v_mad_f32 v17, -v13, v9, v1
2262 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
2263 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, v1
2264 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13
2265 ; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15
2266 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
2267 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2268 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, v3
2269 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2270 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, v9
2271 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
2272 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2273 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v4
2274 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2275 ; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9
2276 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2277 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2278 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2279 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2280 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2281 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2285 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2286 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2287 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2288 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2289 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2290 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2291 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2292 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
2293 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2294 ; GFX9-NEXT: s_mov_b32 s4, 0x40207
2295 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2296 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
2297 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2
2298 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2299 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3
2300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2301 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9
2302 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4
2303 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10
2304 ; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11
2305 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
2306 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4
2307 ; GFX9-NEXT: v_trunc_f32_e32 v11, v11
2308 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4
2309 ; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12
2310 ; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1
2311 ; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11
2312 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9
2313 ; GFX9-NEXT: v_trunc_f32_e32 v12, v12
2314 ; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13
2315 ; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10
2316 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
2317 ; GFX9-NEXT: v_trunc_f32_e32 v13, v13
2318 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
2319 ; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14
2320 ; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9
2321 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13
2322 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
2323 ; GFX9-NEXT: v_trunc_f32_e32 v14, v14
2324 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3
2325 ; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2
2326 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
2327 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
2328 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10
2329 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
2330 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4
2331 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
2332 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
2333 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
2334 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2335 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2336 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2337 ; GFX9-NEXT: global_store_dword v[5:6], v1, off
2338 ; GFX9-NEXT: global_store_dword v[7:8], v0, off
2339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2340 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2341 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2342 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2343 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2344 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2345 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1, so the result is
; <vec0[3], vec1[2], vec0[0], vec1[0]>.
2346 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4>
2347 %vecins = udiv <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the udiv above and the direct store to %out1.
2348 store <4 x i8> %vecins, ptr addrspace(1) %out0
2349 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; uitofp_store_div: per-work-item test; both input addresses are offset by
; workitem.id.x. Builds a two-source byte shuffle, converts it to
; <4 x float> (unsigned) and stores that to %out1, then stores the shuffle
; itself to %out0. The assertions expect each float lane to come from a
; v_cvt_f32_ubyteN applied to one of the loaded dwords, and the stored
; shuffle to be a single v_perm_b32 (selector 0x5020104).
; %elt is not referenced by the body.
2354 define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2355 ; GFX10-LABEL: uitofp_store_div:
2357 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2358 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2359 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2360 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2361 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2362 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2363 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2364 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
2365 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
2366 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2367 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2368 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2369 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v9
2370 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v9
2371 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
2372 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5020104
2373 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2374 ; GFX10-NEXT: global_store_dword v[5:6], v4, off
2375 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2377 ; GFX9-LABEL: uitofp_store_div:
2379 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2381 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2382 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2383 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2384 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2385 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2386 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2387 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2388 ; GFX9-NEXT: s_mov_b32 s4, 0x5020104
2389 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2390 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
2391 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2392 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9
2393 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
2394 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
2395 ; GFX9-NEXT: v_perm_b32 v10, v9, v4, s4
2396 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2397 ; GFX9-NEXT: global_store_dword v[5:6], v10, off
2398 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2399 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2400 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2401 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2402 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2403 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2404 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1, so the result is
; <vec1[0], vec0[1], vec0[2], vec1[1]>.
2405 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5>
2406 %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float>
; Note the destinations: the float vector goes to %out1 and the shuffle to
; %out0.
2407 store <4 x float> %insvec, ptr addrspace(1) %out1
2408 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; urem_store_div: per-work-item test; both input addresses are offset by
; workitem.id.x. Builds a two-source byte shuffle, takes the element-wise
; unsigned remainder by %vec1, stores the remainder to %out0 and the shuffle
; itself to %out1. The assertions expect the i8 remainder to be expanded
; through float conversion (v_cvt_f32_ubyteN, v_rcp_iflag_f32, v_trunc_f32,
; v_mad_f32) followed by v_mul_lo_u32 and a subtract, and the stored shuffle
; to be a single v_perm_b32 (selector 0x2050505).
; %elt is not referenced by the body.
2413 define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2414 ; GFX10-LABEL: urem_store_div:
2416 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2418 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2419 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2420 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2421 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2422 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2423 ; GFX10-NEXT: global_load_dword v2, v[2:3], off
2424 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2425 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2426 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
2427 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
2428 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
2429 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2
2430 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2431 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0
2432 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1
2433 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3
2434 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
2435 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9
2436 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
2437 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2
2438 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2
2439 ; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10
2440 ; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11
2441 ; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12
2442 ; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13
2443 ; GFX10-NEXT: v_trunc_f32_e32 v10, v10
2444 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11
2445 ; GFX10-NEXT: v_trunc_f32_e32 v12, v12
2446 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13
2447 ; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3
2448 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
2449 ; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3
2450 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
2451 ; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3
2452 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
2453 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
2454 ; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15
2455 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13
2456 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2457 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
2458 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
2459 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2460 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
2461 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16
2462 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1
2463 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2464 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
2465 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14
2466 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2467 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2468 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2469 ; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17
2470 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4
2471 ; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2472 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505
2473 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2474 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2475 ; GFX10-NEXT: global_store_dword v[5:6], v1, off
2476 ; GFX10-NEXT: global_store_dword v[7:8], v0, off
2477 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2479 ; GFX9-LABEL: urem_store_div:
2481 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2482 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2483 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2484 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2485 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2486 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2487 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2488 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
2489 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2490 ; GFX9-NEXT: s_mov_b32 s4, 0x2050505
2491 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2492 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
2493 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2
2494 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4
2495 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3
2496 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4
2497 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11
2498 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15
2499 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4
2500 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15
2501 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14
2502 ; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16
2503 ; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3
2504 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
2505 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16
2506 ; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17
2507 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2
2508 ; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3
2509 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16
2510 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2511 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9
2512 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17
2513 ; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18
2514 ; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3
2515 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17
2516 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
2517 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18
2518 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
2519 ; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13
2520 ; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18
2521 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc
2522 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11
2523 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc
2524 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14
2525 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
2526 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4
2527 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4
2528 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc
2529 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4
2530 ; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4
2531 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10
2532 ; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0
2533 ; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12
2534 ; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4
2535 ; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2536 ; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0
2537 ; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2538 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2539 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2540 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2541 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2542 ; GFX9-NEXT: global_store_dword v[7:8], v1, off
2543 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2544 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2545 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2546 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2547 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2548 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2549 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1, so the result is
; <vec1[1], vec1[1], vec1[1], vec0[2]>: the same %vec1 byte is the dividend
; in the first three lanes.
2550 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
2551 %vecins = urem <4 x i8> %shuffle0_0, %vec1
; The shuffle has two users: the urem above and the direct store to %out1.
2552 store <4 x i8> %vecins, ptr addrspace(1) %out0
2553 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; xor_store_div: each work-item uses workitem.id.x to index one <4 x i8> from
; %in0 and one from %in1, shuffles them into
; <vec1[3], vec0[3], vec1[2], vec1[1]>, then stores the shuffle XORed with
; <1, 2, 2, 1> to %out0 and the unmodified shuffle to %out1. The expected
; output keeps one v_perm_b32 (selector 0x5060307) for the unmodified shuffle
; and builds the XORed value with and/xor/or instructions. The i8 %elt
; argument is not referenced by the body.
2558 define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2559 ; GFX10-LABEL: xor_store_div:
2561 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2563 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2564 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2565 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2566 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2567 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2568 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2569 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2570 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00
2571 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
2572 ; GFX10-NEXT: v_mov_b32_e32 v2, 2
2573 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2574 ; GFX10-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2575 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2576 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2577 ; GFX10-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2578 ; GFX10-NEXT: v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2579 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0
2580 ; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3
2581 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
2582 ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2583 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2584 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307
2585 ; GFX10-NEXT: global_store_dword v[5:6], v0, off
2586 ; GFX10-NEXT: global_store_dword v[7:8], v1, off
2587 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2589 ; GFX9-LABEL: xor_store_div:
2591 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2592 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2593 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2594 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2595 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2596 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2597 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2598 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2599 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2600 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
2601 ; GFX9-NEXT: v_mov_b32_e32 v0, 1
2602 ; GFX9-NEXT: v_mov_b32_e32 v1, 2
2603 ; GFX9-NEXT: s_mov_b32 s5, 0x5060307
2604 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2605 ; GFX9-NEXT: v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2606 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2607 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2608 ; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2609 ; GFX9-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2610 ; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2
2611 ; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3
2612 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
2613 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2614 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2615 ; GFX9-NEXT: v_perm_b32 v4, v9, v4, s5
2616 ; GFX9-NEXT: global_store_dword v[5:6], v0, off
2617 ; GFX9-NEXT: global_store_dword v[7:8], v4, off
2618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2619 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Per-work-item addresses: both pointers are indexed by the work-item id.
2620 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2621 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2622 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2623 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2624 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1.
2625 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5>
2626 %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
; The shuffle has two users: the xor above and the direct store below.
2627 store <4 x i8> %vecins, ptr addrspace(1) %out0
2628 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
; zext_store_div: each work-item uses workitem.id.x to index one <4 x i8> from
; %in0 and one from %in1 and shuffles them into
; <vec0[0], vec0[1], vec0[2], vec1[0]>. The shuffle is zero-extended to
; <4 x i16> and stored to %out1, and the unmodified shuffle is stored to %out0.
; The expected output keeps one v_perm_b32 (selector 0x60504) for the
; unmodified shuffle. The i8 %elt argument is not referenced by the body.
2633 define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2634 ; GFX10-LABEL: zext_store_div:
2636 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2637 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2638 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2639 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2640 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2641 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2642 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2643 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2644 ; GFX10-NEXT: global_load_dword v9, v[2:3], off
2645 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
2646 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2647 ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4
2648 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v4
2649 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2650 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v9
2651 ; GFX10-NEXT: v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2652 ; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x5040100
2653 ; GFX10-NEXT: v_perm_b32 v2, v4, v9, 0x60504
2654 ; GFX10-NEXT: v_perm_b32 v1, v3, v10, 0x5040100
2655 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2656 ; GFX10-NEXT: global_store_dword v[5:6], v2, off
2657 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2659 ; GFX9-LABEL: zext_store_div:
2661 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2662 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2663 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4
2664 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
2665 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2666 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
2667 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2668 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2669 ; GFX9-NEXT: global_load_dword v9, v[2:3], off
2670 ; GFX9-NEXT: s_mov_b32 s4, 0x60504
2671 ; GFX9-NEXT: s_movk_i32 s5, 0xff
2672 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100
2673 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2674 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4
2675 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2676 ; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
2677 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v4
2678 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v9
2679 ; GFX9-NEXT: v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2680 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6
2681 ; GFX9-NEXT: v_perm_b32 v1, v3, v4, s6
2682 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2683 ; GFX9-NEXT: global_store_dword v[5:6], v2, off
2684 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2685 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Per-work-item addresses: both pointers are indexed by the work-item id.
2686 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2687 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2688 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2689 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2690 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1.
2691 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
2692 %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
; Note the destinations: the widened vector goes to %out1, the raw shuffle to
; %out0.
2693 store <4 x i16> %insvec, ptr addrspace(1) %out1
2694 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
; Source16Bit: assembles an i32 from the bytes of two 16-bit sources, the i16
; argument %in and element 1 of the <2 x i16> argument %reg. Result bytes, from
; least to most significant, are: low byte of %in, low byte of %reg[1], high
; byte of %in, high byte of %reg[1]. The value is stored to an undef address.
; The expected output is one v_perm_b32 with selector 0x3050204.
2698 define void @Source16Bit(i16 %in, <2 x i16> %reg) {
2699 ; GFX10-LABEL: Source16Bit:
2700 ; GFX10: ; %bb.0: ; %entry
2701 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
2703 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
2704 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2706 ; GFX9-LABEL: Source16Bit:
2707 ; GFX9: ; %bb.0: ; %entry
2708 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2709 ; GFX9-NEXT: s_mov_b32 s4, 0x3050204
2710 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
2711 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
2712 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2713 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2715 %elt0 = extractelement <2 x i16> %reg, i32 1
; Split each 16-bit source into its low byte (mask 255) and its high byte
; (mask -256, i.e. 0xFF00).
2716 %e0b0 = and i16 %elt0, 255
2717 %e0b1 = and i16 %elt0, -256
2718 %e1b0 = and i16 %in, 255
2719 %e1b1 = and i16 %in, -256
; Low half: (low byte of %elt0 << 8) | low byte of %in.
2720 %tmp0 = shl i16 %e0b0, 8
2721 %byte0 = or i16 %tmp0, %e1b0
; High half: high byte of %elt0 (kept in place) | (high byte of %in >> 8).
2722 %tmp2 = lshr i16 %e1b1, 8
2723 %byte1 = or i16 %e0b1, %tmp2
2724 %ext0 = zext i16 %byte0 to i32
2725 %ext1 = zext i16 %byte1 to i32
2726 %shifted = shl i32 %ext1, 16
2727 %result = or i32 %shifted, %ext0
2728 store i32 %result, ptr addrspace(1) undef
; extract3744: builds an i32 from individually extracted bytes using
; extractelement + zext + constant shl + or. Result bytes, from least to most
; significant, are vec1[0], vec1[0], vec1[3], vec2[3], where %vec1 is loaded
; from %in0 and %vec2 from %in1. The expected output is one v_perm_b32 with
; selector 0x3070404.
2732 define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2733 ; GFX10-LABEL: extract3744:
2735 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2737 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2738 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2739 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2740 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2741 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2743 ; GFX9-LABEL: extract3744:
2745 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2746 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2747 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2748 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2749 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2750 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
2751 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2752 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2753 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2754 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2755 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; vec1[0] is used twice: unshifted as byte 0 and shifted by 8 as byte 1.
2756 %v1e0 = extractelement <4 x i8> %vec1, i64 0
2757 %zv1e0 = zext i8 %v1e0 to i32
2758 %byte1 = shl i32 %zv1e0, 8
2760 %v1e3 = extractelement <4 x i8> %vec1, i64 3
2761 %zv1e3 = zext i8 %v1e3 to i32
2762 %byte2 = shl i32 %zv1e3, 16
2763 %v2e3 = extractelement <4 x i8> %vec2, i64 3
2764 %zv2e3 = zext i8 %v2e3 to i32
2765 %byte3 = shl i32 %zv2e3, 24
; OR the four byte lanes together.
2767 %tmp0 = or i32 %zv1e0, %byte1
2768 %tmp1 = or i32 %tmp0, %byte2
2769 %res = or i32 %tmp1, %byte3
2770 store i32 %res, ptr addrspace(1) %out0, align 4
; Byte-permute intrinsic used by @extract_perm_3744 below.
2774 declare i32 @llvm.amdgcn.perm(i32, i32, i32)
; extract_perm_3744: ORs together the results of two llvm.amdgcn.perm calls,
; one taking both data operands from %vec1 (loaded from %in0) and one taking
; both from %vec2 (loaded from %in1). The expected output is one v_perm_b32 on
; the two loaded dwords with selector 0x3070404.
2776 define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2777 ; GFX10-LABEL: extract_perm_3744:
2779 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2780 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2781 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2782 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2783 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2784 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2785 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2787 ; GFX9-LABEL: extract_perm_3744:
2789 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2790 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2791 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2792 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2793 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2794 ; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
2795 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2796 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2797 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2798 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2799 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
2800 %cast1 = bitcast <4 x i8> %vec1 to i32
2801 %cast2 = bitcast <4 x i8> %vec2 to i32
; 201523200 == 0x0C030000 and 51121164 == 0x030C0C0C.
; NOTE(review): selector byte 0x0C presumably produces a constant zero byte, so
; the two results occupy disjoint bytes before the or -- confirm against the
; v_perm_b32 description in the ISA guide.
2802 %lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200)
2803 %hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164)
2804 %res = or i32 %hi8, %lo24
2805 store i32 %res, ptr addrspace(1) %out0, align 4
; extract1347_v2i16: builds an i32 from bytes of two <2 x i16> vectors (%vec1
; from %in0, %vec2 from %in1) using and/lshr/zext/shl/or on the 16-bit
; elements. Result bytes, from least to most significant, are: high byte of
; vec2[1], low byte of vec2[0], high byte of vec1[1], high byte of vec1[0].
; The expected output is one v_perm_b32 with selector 0x1030407.
2809 define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2810 ; GFX10-LABEL: extract1347_v2i16:
2812 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2814 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2816 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407
2817 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2818 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2820 ; GFX9-LABEL: extract1347_v2i16:
2822 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2823 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2824 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2825 ; GFX9-NEXT: s_mov_b32 s4, 0x1030407
2826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2827 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2828 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2830 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2831 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2832 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2833 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2834 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2835 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2836 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Result byte 0: high byte of vec2[1] (mask -256 == 0xFF00, then shift down).
2838 %b0t0 = and i16 -256, %v2e1
2839 %b0t1 = lshr i16 %b0t0, 8
2840 %byte0 = zext i16 %b0t1 to i32
; Result byte 1: low byte of vec2[0].
2842 %b1t0 = and i16 255, %v2e0
2843 %b1t1 = zext i16 %b1t0 to i32
2844 %byte1 = shl i32 %b1t1, 8
; Result byte 2: high byte of vec1[1].
2846 %b2t0 = and i16 -256, %v1e1
2847 %b2t1 = lshr i16 %b2t0, 8
2848 %b2t2 = zext i16 %b2t1 to i32
2849 %byte2 = shl i32 %b2t2, 16
; Result byte 3: high byte of vec1[0].
2851 %b3t0 = and i16 -256, %v1e0
2852 %b3t1 = lshr i16 %b3t0, 8
2853 %b3t2 = zext i16 %b3t1 to i32
2854 %byte3 = shl i32 %b3t2, 24
2856 %tmp0 = or i32 %byte0, %byte1
2857 %tmp1 = or i32 %tmp0, %byte2
2858 %res = or i32 %tmp1, %byte3
2859 store i32 %res, ptr addrspace(1) %out0, align 4
; 16-bit funnel-shift-right intrinsic used by the fshri16_* tests.
2864 declare i16 @llvm.fshr.i16(i16, i16, i16)
; fshri16_8: applies fshr.i16 with shift amount 8 to the element pair of %vec2
; (loaded from %in1) and to the element pair of %vec1 (loaded from %in0), then
; packs the %vec2 result into the low 16 bits and the %vec1 result into the
; high 16 bits of an i32. The expected output is one v_perm_b32 with selector
; 0x30407.
2866 define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2867 ; GFX10-LABEL: fshri16_8:
2869 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2871 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2872 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2873 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2874 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2875 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2877 ; GFX9-LABEL: fshri16_8:
2879 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2881 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2882 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2883 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2884 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2885 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2886 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2887 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2888 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2889 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2890 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2891 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2892 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2893 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
2895 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
2896 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
2898 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
2899 %tmp23.1 = zext i16 %tmp23.0 to i32
2900 %byte23 = shl i32 %tmp23.1, 16
2901 %res = or i32 %byte01, %byte23
2902 store i32 %res, ptr addrspace(1) %out0, align 4
; fshri16_16: same shape as the other fshr.i16 tests, with a shift amount of
; 16, which equals the operand bit width. The LLVM LangRef treats the funnel
; shift amount modulo the bit width, so this acts as a shift by 0. The %vec2
; result fills the low 16 bits and the %vec1 result the high 16 bits of the
; stored i32. The expected output is one v_perm_b32 with selector 0x3020706.
2906 define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2907 ; GFX10-LABEL: fshri16_16:
2909 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2910 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2911 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2912 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2913 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
2914 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2915 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2917 ; GFX9-LABEL: fshri16_16:
2919 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2921 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2922 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
2923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2924 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2925 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2926 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2927 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2928 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2929 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2930 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2931 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2932 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2933 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
2935 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
2936 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
2938 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
2939 %tmp23.1 = zext i16 %tmp23.0 to i32
2940 %byte23 = shl i32 %tmp23.1, 16
2941 %res = or i32 %byte01, %byte23
2942 store i32 %res, ptr addrspace(1) %out0, align 4
; fshri16_24: same shape as the other fshr.i16 tests, with a shift amount of
; 24, which exceeds the 16-bit operand width. The LLVM LangRef treats the
; funnel shift amount modulo the bit width, so this acts as a shift by 8. The
; %vec2 result fills the low 16 bits and the %vec1 result the high 16 bits of
; the stored i32. The expected output is one v_perm_b32 with selector 0x30407.
2946 define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2947 ; GFX10-LABEL: fshri16_24:
2949 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2950 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2951 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2952 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2953 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2954 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2955 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2957 ; GFX9-LABEL: fshri16_24:
2959 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2960 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2961 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2962 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2963 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2964 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
2965 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2966 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2967 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2968 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2969 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2970 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2971 %v1e1 = extractelement <2 x i16> %vec1, i64 1
2972 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2973 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
2975 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
2976 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
2978 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
2979 %tmp23.1 = zext i16 %tmp23.0 to i32
2980 %byte23 = shl i32 %tmp23.1, 16
2981 %res = or i32 %byte01, %byte23
2982 store i32 %res, ptr addrspace(1) %out0, align 4
; fshri16_32: same shape as the other fshr.i16 tests, with a shift amount of
; 32, twice the 16-bit operand width. The LLVM LangRef treats the funnel shift
; amount modulo the bit width, so this acts as a shift by 0. The %vec2 result
; fills the low 16 bits and the %vec1 result the high 16 bits of the stored
; i32. The expected output is one v_perm_b32 with selector 0x3020706.
2986 define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2987 ; GFX10-LABEL: fshri16_32:
2989 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2991 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2992 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2993 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
2994 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2995 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2997 ; GFX9-LABEL: fshri16_32:
2999 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3000 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3001 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3002 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
3003 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3004 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3005 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3006 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3007 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3008 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3009 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3010 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3011 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3012 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3013 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3015 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
3016 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3018 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
3019 %tmp23.1 = zext i16 %tmp23.0 to i32
3020 %byte23 = shl i32 %tmp23.1, 16
3021 %res = or i32 %byte01, %byte23
3022 store i32 %res, ptr addrspace(1) %out0, align 4
; fshri16_88: same shape as the other fshr.i16 tests, with a shift amount of
; 88, far larger than the 16-bit operand width. The LLVM LangRef treats the
; funnel shift amount modulo the bit width (88 mod 16 == 8), so this acts as a
; shift by 8. The %vec2 result fills the low 16 bits and the %vec1 result the
; high 16 bits of the stored i32. The expected output is one v_perm_b32 with
; selector 0x30407.
3026 define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3027 ; GFX10-LABEL: fshri16_88:
3029 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3030 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3031 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3032 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3033 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3034 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3035 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3037 ; GFX9-LABEL: fshri16_88:
3039 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3040 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3041 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3042 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3043 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3044 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3045 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3046 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3047 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3048 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3049 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3050 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3051 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3052 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3053 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3055 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
3056 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3058 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
3059 %tmp23.1 = zext i16 %tmp23.0 to i32
3060 %byte23 = shl i32 %tmp23.1, 16
3061 %res = or i32 %byte01, %byte23
3062 store i32 %res, ptr addrspace(1) %out0, align 4
; 16-bit funnel-shift-left intrinsic used by the fshli16_* tests.
3066 declare i16 @llvm.fshl.i16(i16, i16, i16)
; fshli16_1347: applies fshl.i16 with shift amount 8 to the element pair of
; %vec2 (loaded from %in1) and to the element pair of %vec1 (loaded from %in0),
; then packs the %vec2 result into the low 16 bits and the %vec1 result into
; the high 16 bits of an i32. The expected output is one v_perm_b32 with
; selector 0x30407.
; NOTE(review): the sibling fshl tests are named after their shift amount
; (fshli16_16, fshli16_24, ...); this one shifts by 8, so the "1347" suffix
; presumably refers to something else (selected bytes?) -- confirm intent.
3068 define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3069 ; GFX10-LABEL: fshli16_1347:
3071 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3072 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3073 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3074 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3075 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3076 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3077 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3079 ; GFX9-LABEL: fshli16_1347:
3081 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3082 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3083 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3084 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3085 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3086 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3087 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3089 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3090 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3091 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3092 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3093 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3094 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3095 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3097 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
3098 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3100 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
3101 %tmp23.1 = zext i16 %tmp23.0 to i32
3102 %byte23 = shl i32 %tmp23.1, 16
3103 %res = or i32 %byte01, %byte23
3104 store i32 %res, ptr addrspace(1) %out0, align 4
; fshli16_16: same shape as the other fshl.i16 tests, with a shift amount of
; 16, which equals the operand bit width. The LLVM LangRef treats the funnel
; shift amount modulo the bit width, so this acts as a shift by 0. The %vec2
; result fills the low 16 bits and the %vec1 result the high 16 bits of the
; stored i32. The expected output is one v_perm_b32 with selector 0x1000504.
3108 define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3109 ; GFX10-LABEL: fshli16_16:
3111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3113 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3114 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3115 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3116 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3117 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3119 ; GFX9-LABEL: fshli16_16:
3121 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3122 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3123 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3124 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3125 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3126 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3127 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3129 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3130 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3131 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3132 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3133 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3134 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3135 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3137 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
3138 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3140 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
3141 %tmp23.1 = zext i16 %tmp23.0 to i32
3142 %byte23 = shl i32 %tmp23.1, 16
3143 %res = or i32 %byte01, %byte23
3144 store i32 %res, ptr addrspace(1) %out0, align 4
; fshli16_24: same shape as the other fshl.i16 tests, with a shift amount of
; 24, which exceeds the 16-bit operand width. The LLVM LangRef treats the
; funnel shift amount modulo the bit width, so this acts as a shift by 8. The
; %vec2 result fills the low 16 bits and the %vec1 result the high 16 bits of
; the stored i32. The expected output is one v_perm_b32 with selector 0x30407.
3148 define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3149 ; GFX10-LABEL: fshli16_24:
3151 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3153 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3154 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3155 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3156 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3157 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3159 ; GFX9-LABEL: fshli16_24:
3161 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3162 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3163 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3164 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3165 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3166 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3167 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3169 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3170 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3171 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3172 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3173 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3174 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3175 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3177 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
3178 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3180 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
3181 %tmp23.1 = zext i16 %tmp23.0 to i32
3182 %byte23 = shl i32 %tmp23.1, 16
3183 %res = or i32 %byte01, %byte23
3184 store i32 %res, ptr addrspace(1) %out0, align 4
; fshli16_32: same shape as the other fshl.i16 tests, with a shift amount of
; 32, twice the 16-bit operand width. The LLVM LangRef treats the funnel shift
; amount modulo the bit width, so this acts as a shift by 0. The %vec2 result
; fills the low 16 bits and the %vec1 result the high 16 bits of the stored
; i32. The expected output is one v_perm_b32 with selector 0x1000504.
3188 define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3189 ; GFX10-LABEL: fshli16_32:
3191 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3192 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3193 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3194 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3195 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3196 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3197 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3199 ; GFX9-LABEL: fshli16_32:
3201 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3202 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3203 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3204 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3205 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3206 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3207 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3208 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3209 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3210 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3211 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3212 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3213 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3214 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3215 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3217 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
3218 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3220 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
3221 %tmp23.1 = zext i16 %tmp23.0 to i32
3222 %byte23 = shl i32 %tmp23.1, 16
3223 %res = or i32 %byte01, %byte23
3224 store i32 %res, ptr addrspace(1) %out0, align 4
; fshli16_88: same shape as the other fshl.i16 tests, with a shift amount of
; 88, far larger than the 16-bit operand width. The LLVM LangRef treats the
; funnel shift amount modulo the bit width (88 mod 16 == 8), so this acts as a
; shift by 8. The %vec2 result fills the low 16 bits and the %vec1 result the
; high 16 bits of the stored i32. The expected output is one v_perm_b32 with
; selector 0x30407.
3228 define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3229 ; GFX10-LABEL: fshli16_88:
3231 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3232 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3233 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
3234 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3235 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3236 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3237 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3239 ; GFX9-LABEL: fshli16_88:
3241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3242 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3243 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
3244 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3245 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3246 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3247 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3248 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3249 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3250 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3251 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3252 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3253 %v1e1 = extractelement <2 x i16> %vec1, i64 1
3254 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3255 %v2e1 = extractelement <2 x i16> %vec2, i64 1
; Low half of the result: funnel shift of the %vec2 elements.
3257 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
3258 %byte01 = zext i16 %tmp01.0 to i32
; High half of the result: funnel shift of the %vec1 elements.
3260 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
3261 %tmp23.1 = zext i16 %tmp23.0 to i32
3262 %byte23 = shl i32 %tmp23.1, 16
3263 %res = or i32 %byte01, %byte23
3264 store i32 %res, ptr addrspace(1) %out0, align 4
; shlbase: ORs together vec1[0] (unshifted), vec1[0], vec1[3] and vec2[3],
; where the last three are shifted left by %base + 8, %base + 16 and
; %base + 24 respectively. %vec1 is loaded from %in0 and %vec2 from %in1.
; Because the shift amounts depend on the runtime value %base, the expected
; output contains no v_perm_b32; it uses add, v_lshlrev_b32_sdwa,
; v_lshl_or_b32 and v_or3_b32.
3268 define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
3269 ; GFX10-LABEL: shlbase:
3271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3272 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3273 ; GFX10-NEXT: global_load_dword v8, v[2:3], off
3274 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6
3275 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v6
3276 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v6
3277 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3278 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7
3279 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3280 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3281 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3282 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, v3, v2
3283 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
3284 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3285 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3287 ; GFX9-LABEL: shlbase:
3289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3290 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3291 ; GFX9-NEXT: global_load_dword v8, v[2:3], off
3292 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v6
3293 ; GFX9-NEXT: v_add_u32_e32 v1, 16, v6
3294 ; GFX9-NEXT: v_add_u32_e32 v2, 24, v6
3295 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3296 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7
3297 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3298 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3299 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3300 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, v0, v3
3301 ; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
3302 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3303 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3304 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3305 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
3306 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
3307 %v1e0 = extractelement <4 x i8> %vec1, i64 0
3308 %zv1e0 = zext i8 %v1e0 to i32
; Each shift amount is a constant byte offset plus the runtime %base.
3309 %b8 = add i32 %base, 8
3310 %byte1 = shl i32 %zv1e0, %b8
3312 %v1e3 = extractelement <4 x i8> %vec1, i64 3
3313 %zv1e3 = zext i8 %v1e3 to i32
3314 %b16 = add i32 %base, 16
3315 %byte2 = shl i32 %zv1e3, %b16
3316 %v2e3 = extractelement <4 x i8> %vec2, i64 3
3317 %zv2e3 = zext i8 %v2e3 to i32
3318 %b24 = add i32 %base, 24
3319 %byte3 = shl i32 %zv2e3, %b24
3321 %tmp0 = or i32 %zv1e0, %byte1
3322 %tmp1 = or i32 %tmp0, %byte2
3323 %res = or i32 %tmp1, %byte3
3324 store i32 %res, ptr addrspace(1) %out0, align 4
3328 ; TODO -- lower into v_perm
; extractbase: builds an i32 from bytes extracted at runtime indices. Result
; bytes, from least to most significant, are vec1[%base], vec1[%base],
; vec1[%base + 3], vec2[%base + 3], with constant shifts of 8, 16 and 24.
; %vec1 is loaded from %in0 and %vec2 from %in1. The expected output currently
; uses v_bfe_u32 and shift/or instructions with no v_perm_b32, which is what
; the TODO above refers to.
3329 define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) {
3330 ; GFX10-LABEL: extractbase:
3332 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3334 ; GFX10-NEXT: global_load_dword v8, v[2:3], off
3335 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6
3336 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v0
3337 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3338 ; GFX10-NEXT: v_bfe_u32 v2, v7, v1, 8
3339 ; GFX10-NEXT: v_bfe_u32 v0, v7, v0, 8
3340 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3341 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3342 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3343 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v0
3344 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
3345 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3346 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3348 ; GFX9-LABEL: extractbase:
3350 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3351 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3352 ; GFX9-NEXT: global_load_dword v8, v[2:3], off
3353 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v6
3354 ; GFX9-NEXT: v_add_u32_e32 v1, 24, v0
3355 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3356 ; GFX9-NEXT: v_bfe_u32 v0, v7, v0, 8
3357 ; GFX9-NEXT: v_bfe_u32 v2, v7, v1, 8
3358 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3359 ; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3360 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3361 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
3362 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
3363 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3365 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3366 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
3367 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
; The element index is the runtime value %base, not a constant.
3368 %v1b = extractelement <4 x i8> %vec1, i64 %base
3369 %zv1b = zext i8 %v1b to i32
3370 %byte1 = shl i32 %zv1b, 8
; Second runtime index, shared by the extracts from %vec1 and %vec2.
3372 %b3 = add i64 %base, 3
3373 %v1b3 = extractelement <4 x i8> %vec1, i64 %b3
3374 %zv1b3 = zext i8 %v1b3 to i32
3375 %byte2 = shl i32 %zv1b3, 16
3376 %v2b3 = extractelement <4 x i8> %vec2, i64 %b3
3377 %zv2b3 = zext i8 %v2b3 to i32
3378 %byte3 = shl i32 %zv2b3, 24
3380 %tmp0 = or i32 %zv1b, %byte1
3381 %tmp1 = or i32 %tmp0, %byte2
3382 %res = or i32 %tmp1, %byte3
3383 store i32 %res, ptr addrspace(1) %out0, align 4
; extract_hilo: assemble an i32 from constant-index bytes of two <8 x i8>
; loads, taking vec1 elements from its upper four bytes (indices 5 and 6) and
; a vec2 element from its lower four bytes (index 3).
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[5], byte 2 = vec1[6], byte 3 = vec2[3]
; Both targets are expected to emit two dword loads (one at offset:4) and a
; single v_perm_b32 with selector 0x3060505.
3387 define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3388 ; GFX10-LABEL: extract_hilo:
3390 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3391 ; GFX10-NEXT: global_load_dword v6, v[2:3], off
3392 ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
3393 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3394 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3060505
3395 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3396 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3398 ; GFX9-LABEL: extract_hilo:
3400 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3401 ; GFX9-NEXT: global_load_dword v6, v[2:3], off
3402 ; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
3403 ; GFX9-NEXT: s_mov_b32 s4, 0x3060505
3404 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3405 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3406 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3407 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3408 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3409 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
3410 %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
; vec1[5]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3411 %v1e5 = extractelement <8 x i8> %vec1, i64 5
3412 %zv1e5 = zext i8 %v1e5 to i32
3413 %byte1 = shl i32 %zv1e5, 8
; vec1[6] -> byte 2, vec2[3] -> byte 3.
3415 %v1e6 = extractelement <8 x i8> %vec1, i64 6
3416 %zv1e6 = zext i8 %v1e6 to i32
3417 %byte2 = shl i32 %zv1e6, 16
3418 %v2e3 = extractelement <8 x i8> %vec2, i64 3
3419 %zv2e3 = zext i8 %v2e3 to i32
3420 %byte3 = shl i32 %zv2e3, 24
; OR the four byte lanes together and store the packed dword.
3422 %tmp0 = or i32 %zv1e5, %byte1
3423 %tmp1 = or i32 %tmp0, %byte2
3424 %res = or i32 %tmp1, %byte3
3425 store i32 %res, ptr addrspace(1) %out0, align 4
; extract_lohi: assemble an i32 from constant-index bytes of two <8 x i8>
; loads, taking vec1 elements from its lower four bytes (indices 0 and 3) and
; a vec2 element from its upper four bytes (index 4).
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[0], byte 2 = vec1[3], byte 3 = vec2[4]
; Both targets are expected to emit two dword loads (one at offset:4) and a
; single v_perm_b32 with selector 0x70404.
3429 define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3430 ; GFX10-LABEL: extract_lohi:
3432 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3433 ; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
3434 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3435 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3436 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x70404
3437 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3438 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3440 ; GFX9-LABEL: extract_lohi:
3442 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3443 ; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
3444 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3445 ; GFX9-NEXT: s_mov_b32 s4, 0x70404
3446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3447 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3448 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3449 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3450 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3451 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
3452 %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
; vec1[0]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3453 %v1e0 = extractelement <8 x i8> %vec1, i64 0
3454 %zv1e0 = zext i8 %v1e0 to i32
3455 %byte1 = shl i32 %zv1e0, 8
; vec1[3] -> byte 2, vec2[4] -> byte 3.
3457 %v1e3 = extractelement <8 x i8> %vec1, i64 3
3458 %zv1e3 = zext i8 %v1e3 to i32
3459 %byte2 = shl i32 %zv1e3, 16
3460 %v2e4 = extractelement <8 x i8> %vec2, i64 4
3461 %zv2e4 = zext i8 %v2e4 to i32
3462 %byte3 = shl i32 %zv2e4, 24
; OR the four byte lanes together and store the packed dword.
3464 %tmp0 = or i32 %zv1e0, %byte1
3465 %tmp1 = or i32 %tmp0, %byte2
3466 %res = or i32 %tmp1, %byte3
3467 store i32 %res, ptr addrspace(1) %out0, align 4
; extract_hihi: assemble an i32 from constant-index bytes of two <8 x i8>
; loads, where every selected element lies in the upper four bytes of its
; vector (vec1 indices 5 and 7, vec2 index 6).
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[5], byte 2 = vec1[7], byte 3 = vec2[6]
; Both targets are expected to emit two dword loads at offset:4 and a single
; v_perm_b32 with selector 0x2070505.
3471 define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3472 ; GFX10-LABEL: extract_hihi:
3474 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3475 ; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
3476 ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
3477 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3478 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x2070505
3479 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3480 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3482 ; GFX9-LABEL: extract_hihi:
3484 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3485 ; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
3486 ; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
3487 ; GFX9-NEXT: s_mov_b32 s4, 0x2070505
3488 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3489 ; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
3490 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3492 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3493 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
3494 %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
; vec1[5]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3495 %v1e5 = extractelement <8 x i8> %vec1, i64 5
3496 %zv1e5 = zext i8 %v1e5 to i32
3497 %byte1 = shl i32 %zv1e5, 8
; vec1[7] -> byte 2, vec2[6] -> byte 3.
3499 %v1e7 = extractelement <8 x i8> %vec1, i64 7
3500 %zv1e7 = zext i8 %v1e7 to i32
3501 %byte2 = shl i32 %zv1e7, 16
3502 %v2e6 = extractelement <8 x i8> %vec2, i64 6
3503 %zv2e6 = zext i8 %v2e6 to i32
3504 %byte3 = shl i32 %zv2e6, 24
; OR the four byte lanes together and store the packed dword.
3506 %tmp0 = or i32 %zv1e5, %byte1
3507 %tmp1 = or i32 %tmp0, %byte2
3508 %res = or i32 %tmp1, %byte3
3509 store i32 %res, ptr addrspace(1) %out0, align 4
; extract_v8i8: assemble an i32 from constant-index bytes of a single
; <8 x i8> load; the selected elements come from both four-byte halves of the
; vector.
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[4], byte 2 = vec1[7], byte 3 = vec1[1]
; Both targets are expected to emit one global_load_dwordx2 and a single
; v_perm_b32 with selector 0x1070404.
3513 define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
3514 ; GFX10-LABEL: extract_v8i8:
3516 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3517 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3518 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3519 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404
3520 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
3521 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3523 ; GFX9-LABEL: extract_v8i8:
3525 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3526 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3527 ; GFX9-NEXT: s_mov_b32 s4, 0x1070404
3528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3529 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
3530 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
3531 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3532 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3533 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
; vec1[4]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3534 %v1e4 = extractelement <8 x i8> %vec1, i64 4
3535 %zv1e4 = zext i8 %v1e4 to i32
3536 %byte1 = shl i32 %zv1e4, 8
; vec1[7] -> byte 2.
3538 %v1e7 = extractelement <8 x i8> %vec1, i64 7
3539 %zv1e7 = zext i8 %v1e7 to i32
3540 %byte2 = shl i32 %zv1e7, 16
; vec1[1] -> byte 3. Despite the "v2" in the value name, the source is %vec1;
; this function has only one input vector.
3541 %v2e1 = extractelement <8 x i8> %vec1, i64 1
3542 %zv2e1 = zext i8 %v2e1 to i32
3543 %byte3 = shl i32 %zv2e1, 24
; OR the four byte lanes together and store the packed dword.
3545 %tmp0 = or i32 %zv1e4, %byte1
3546 %tmp1 = or i32 %tmp0, %byte2
3547 %res = or i32 %tmp1, %byte3
3548 store i32 %res, ptr addrspace(1) %out0, align 4
; extract_v256i8: assemble an i32 from constant-index bytes of a <256 x i8>
; load, using only elements 253..255 (the last four-byte group of the vector).
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[255], byte 2 = vec1[253], byte 3 = vec1[254]
; Both targets are expected to emit a single global_load_dword at offset:252
; and a single v_perm_b32 with selector 0x6050707.
; NOTE(review): the value names (%v1e4, %v1e7, %v2e1) do not match the indices
; actually extracted (255, 253, 254); read the indices, not the names.
3552 define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
3553 ; GFX10-LABEL: extract_v256i8:
3555 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3556 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:252
3557 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3558 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6050707
3559 ; GFX10-NEXT: global_store_dword v[2:3], v0, off
3560 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3562 ; GFX9-LABEL: extract_v256i8:
3564 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3565 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:252
3566 ; GFX9-NEXT: s_mov_b32 s4, 0x6050707
3567 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3568 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
3569 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
3570 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3571 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3572 %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4
; vec1[255]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3573 %v1e4 = extractelement <256 x i8> %vec1, i64 255
3574 %zv1e4 = zext i8 %v1e4 to i32
3575 %byte1 = shl i32 %zv1e4, 8
; vec1[253] -> byte 2.
3577 %v1e7 = extractelement <256 x i8> %vec1, i64 253
3578 %zv1e7 = zext i8 %v1e7 to i32
3579 %byte2 = shl i32 %zv1e7, 16
; vec1[254] -> byte 3.
3580 %v2e1 = extractelement <256 x i8> %vec1, i64 254
3581 %zv2e1 = zext i8 %v2e1 to i32
3582 %byte3 = shl i32 %zv2e1, 24
; OR the four byte lanes together and store the packed dword.
3584 %tmp0 = or i32 %zv1e4, %byte1
3585 %tmp1 = or i32 %tmp0, %byte2
3586 %res = or i32 %tmp1, %byte3
3587 store i32 %res, ptr addrspace(1) %out0, align 4
3591 ; TODO : support this pattern
; extract_3src: assemble an i32 from constant-index bytes that live in three
; different loaded dwords: vec1[0] and vec1[5] fall in different four-byte
; halves of vec1, and vec2[6] comes from a second vector.
; Result layout as written in the IR below:
;   byte 0 and byte 1 = vec1[0], byte 2 = vec1[5], byte 3 = vec2[6]
; The GFX10 and GFX9 checks currently expect a shift / and / or sequence
; (v_lshlrev, v_and, v_lshl_or, v_or3) rather than v_perm_b32, which is the
; pattern the TODO above asks to support.
3592 define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3593 ; GFX10-LABEL: extract_3src:
3595 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3596 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
3597 ; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
3598 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3599 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
3600 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3601 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
3602 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
3603 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
3604 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
3605 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
3606 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
3607 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3608 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3610 ; GFX9-LABEL: extract_3src:
3612 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3613 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
3614 ; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
3615 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3616 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
3617 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
3618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3619 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
3620 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
3621 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
3622 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
3623 ; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
3624 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3625 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3626 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3627 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
3628 %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
; vec1[0]: used unshifted (byte 0) and shifted left by 8 (byte 1).
3629 %v1e0 = extractelement <8 x i8> %vec1, i64 0
3630 %zv1e0 = zext i8 %v1e0 to i32
3631 %byte1 = shl i32 %zv1e0, 8
; vec1[5] -> byte 2, vec2[6] -> byte 3.
3633 %v1e5 = extractelement <8 x i8> %vec1, i64 5
3634 %zv1e5 = zext i8 %v1e5 to i32
3635 %byte2 = shl i32 %zv1e5, 16
3636 %v2e6 = extractelement <8 x i8> %vec2, i64 6
3637 %zv2e6 = zext i8 %v2e6 to i32
3638 %byte3 = shl i32 %zv2e6, 24
; OR the four byte lanes together and store the packed dword.
3640 %tmp0 = or i32 %zv1e0, %byte1
3641 %tmp1 = or i32 %tmp0, %byte2
3642 %res = or i32 %tmp1, %byte3
3643 store i32 %res, ptr addrspace(1) %out0, align 4
3647 ; Should not result in crash
; extract_v6i16: crash-regression test using a <6 x i16> vector loaded with
; align 2. Two i32 values are built from 16-bit elements and stored:
;   %o0 = (vec[1] << 16) | vec[0]   -> %out0
;   %o1 = vec[2] | (vec[3] << 16)   -> %out1
; %in1 is declared but not referenced in the body.
; Both targets are expected to emit four global_load_ushort instructions and
; two v_lshl_or_b32 instructions (no v_perm_b32).
3648 define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
3649 ; GFX10-LABEL: extract_v6i16:
3651 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3652 ; GFX10-NEXT: s_clause 0x3
3653 ; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
3654 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off
3655 ; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
3656 ; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
3657 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3658 ; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3
3659 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3660 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9
3661 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3662 ; GFX10-NEXT: global_store_dword v[6:7], v1, off
3663 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3665 ; GFX9-LABEL: extract_v6i16:
3667 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6
3669 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off
3670 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
3671 ; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2
3672 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3673 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v8
3674 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3675 ; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v3
3676 ; GFX9-NEXT: global_store_dword v[4:5], v1, off
3677 ; GFX9-NEXT: global_store_dword v[6:7], v0, off
3678 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3679 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3680 %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2
; Only the first four of the six elements are used.
3681 %el0 = extractelement <6 x i16> %vec, i32 0
3682 %el1 = extractelement <6 x i16> %vec, i32 1
3683 %el2 = extractelement <6 x i16> %vec, i32 2
3684 %el3 = extractelement <6 x i16> %vec, i32 3
; %o0: element 1 in the high half, element 0 in the low half.
3685 %z0 = zext i16 %el0 to i32
3686 %z1 = zext i16 %el1 to i32
3687 %s1 = shl nuw i32 %z1, 16
3688 %o0 = or i32 %s1, %z0
; %o1: element 3 in the high half, element 2 in the low half (the or operands
; are in the opposite order from %o0).
3689 %z2 = zext i16 %el2 to i32
3690 %z3 = zext i16 %el3 to i32
3691 %s3 = shl nuw i32 %z3, 16
3692 %o1 = or i32 %z2, %s3
3694 store i32 %o0, ptr addrspace(1) %out0, align 4
3695 store i32 %o1, ptr addrspace(1) %out1, align 4
3700 ; Should not result in crash
; extract_v7i16: crash-regression test using a <7 x i16> vector loaded with
; align 2. Two i32 values are built from 16-bit elements and stored:
;   %o0 = (vec[1] << 16) | vec[0]   -> %out0
;   %o1 = vec[2] | (vec[3] << 16)   -> %out1
; %in1 is declared but not referenced in the body.
; Both targets are expected to emit one global_load_dwordx2 followed directly
; by two global_store_dword instructions, with no shift, or, or perm
; instructions in between.
3701 define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
3702 ; GFX10-LABEL: extract_v7i16:
3704 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3705 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3706 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3707 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3708 ; GFX10-NEXT: global_store_dword v[6:7], v1, off
3709 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3711 ; GFX9-LABEL: extract_v7i16:
3713 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3714 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3715 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3716 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3717 ; GFX9-NEXT: global_store_dword v[6:7], v1, off
3718 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3719 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3720 %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2
; Only the first four of the seven elements are used.
3721 %el0 = extractelement <7 x i16> %vec, i32 0
3722 %el1 = extractelement <7 x i16> %vec, i32 1
3723 %el2 = extractelement <7 x i16> %vec, i32 2
3724 %el3 = extractelement <7 x i16> %vec, i32 3
; %o0: element 1 in the high half, element 0 in the low half.
3725 %z0 = zext i16 %el0 to i32
3726 %z1 = zext i16 %el1 to i32
3727 %s1 = shl nuw i32 %z1, 16
3728 %o0 = or i32 %s1, %z0
; %o1: element 3 in the high half, element 2 in the low half.
3729 %z2 = zext i16 %el2 to i32
3730 %z3 = zext i16 %el3 to i32
3731 %s3 = shl nuw i32 %z3, 16
3732 %o1 = or i32 %z2, %s3
3734 store i32 %o0, ptr addrspace(1) %out0, align 4
3735 store i32 %o1, ptr addrspace(1) %out1, align 4
3739 ; Should not result in crash
; extract_v13i8: crash-regression test using a <13 x i8> vector loaded with
; align 2. Two i32 values are built from zero-extended bytes and stored:
;   %o0 = (vec[1] << 16) | vec[0]   -> %out0   (bytes 1 and 3 are zero)
;   %o1 = vec[7] | (vec[8] << 16)   -> %out1   (bytes 1 and 3 are zero)
; %in1 is declared but not referenced in the body.
; Both targets are expected to emit a global_load_dwordx2 plus a
; global_load_ushort at offset:8, then v_bfe_u32 / v_and_b32 followed by two
; v_perm_b32 instructions with selectors 0x5040c00 and 0x5040c03.
3740 define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
3741 ; GFX10-LABEL: extract_v13i8:
3743 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3744 ; GFX10-NEXT: s_clause 0x1
3745 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
3746 ; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8
3747 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3748 ; GFX10-NEXT: v_bfe_u32 v0, v2, 8, 8
3749 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3750 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8
3751 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00
3752 ; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03
3753 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3754 ; GFX10-NEXT: global_store_dword v[6:7], v1, off
3755 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3757 ; GFX9-LABEL: extract_v13i8:
3759 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3760 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
3761 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
3762 ; GFX9-NEXT: s_mov_b32 s4, 0x5040c00
3763 ; GFX9-NEXT: s_mov_b32 s5, 0x5040c03
3764 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3765 ; GFX9-NEXT: v_bfe_u32 v0, v2, 8, 8
3766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3767 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8
3768 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
3769 ; GFX9-NEXT: v_perm_b32 v1, v1, v3, s5
3770 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3771 ; GFX9-NEXT: global_store_dword v[6:7], v1, off
3772 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3773 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3774 %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
; Note the value names %el2 / %el3 refer to vector indices 7 and 8.
3775 %el0 = extractelement <13 x i8> %vec, i32 0
3776 %el1 = extractelement <13 x i8> %vec, i32 1
3777 %el2 = extractelement <13 x i8> %vec, i32 7
3778 %el3 = extractelement <13 x i8> %vec, i32 8
; %o0: vec[1] placed at bit 16, vec[0] at bit 0.
3779 %z0 = zext i8 %el0 to i32
3780 %z1 = zext i8 %el1 to i32
3781 %s1 = shl nuw i32 %z1, 16
3782 %o0 = or i32 %s1, %z0
; %o1: vec[8] placed at bit 16, vec[7] at bit 0.
3783 %z2 = zext i8 %el2 to i32
3784 %z3 = zext i8 %el3 to i32
3785 %s3 = shl nuw i32 %z3, 16
3786 %o1 = or i32 %z2, %s3
3788 store i32 %o0, ptr addrspace(1) %out0, align 4
3789 store i32 %o1, ptr addrspace(1) %out1, align 4
3793 ; Should not result in crash
; extract_v13i64: crash-regression test using a <13 x i64> vector loaded with
; align 2. Two i32 values are built from pieces of 64-bit elements and stored:
;   %o0 = (trunc32(vec[1]) << 16) | bits 32..47 of vec[0]   -> %out0
;   %o1 = low 16 bits of vec[7] | (trunc32(vec[8]) << 16)   -> %out1
; %in1 is declared but not referenced in the body.
; Both targets are expected to emit three global_load_dwordx4 instructions
; (offsets 48, 0 and 64) and two v_perm_b32 instructions with selector
; 0x1000504.
3794 define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
3795 ; GFX10-LABEL: extract_v13i64:
3797 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3798 ; GFX10-NEXT: s_clause 0x2
3799 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
3800 ; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
3801 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
3802 ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
3803 ; GFX10-NEXT: s_waitcnt vmcnt(1)
3804 ; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
3805 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3806 ; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
3807 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
3808 ; GFX10-NEXT: global_store_dword v[6:7], v1, off
3809 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3811 ; GFX9-LABEL: extract_v13i64:
3813 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3814 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
3815 ; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
3816 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
3817 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3818 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
3819 ; GFX9-NEXT: s_waitcnt vmcnt(1)
3820 ; GFX9-NEXT: v_perm_b32 v0, v12, v13, s4
3821 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3822 ; GFX9-NEXT: v_perm_b32 v1, v10, v14, s4
3823 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3824 ; GFX9-NEXT: global_store_dword v[6:7], v1, off
3825 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3826 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3827 %vec = load <13 x i64>, ptr addrspace(1) %in0, align 2
; Note the value names %el2 / %el3 refer to vector indices 7 and 8.
3828 %el0 = extractelement <13 x i64> %vec, i32 0
3829 %el1 = extractelement <13 x i64> %vec, i32 1
3830 %el2 = extractelement <13 x i64> %vec, i32 7
3831 %el3 = extractelement <13 x i64> %vec, i32 8
; %z0 = bits 32..47 of vec[0]: shift the upper dword down, keep 16 bits.
3832 %el00 = lshr i64 %el0, 32
3833 %t0 = trunc i64 %el00 to i16
3834 %z0 = zext i16 %t0 to i32
; %z1 is a plain 32-bit truncation (not a zero-extended i16) that is then
; shifted left by 16 with the nuw flag.
3835 %z1 = trunc i64 %el1 to i32
3836 %s1 = shl nuw i32 %z1, 16
3837 %o0 = or i32 %s1, %z0
; Second result: low 16 bits of vec[7] combined with trunc32(vec[8]) << 16.
3838 %t2 = trunc i64 %el2 to i16
3839 %z2 = zext i16 %t2 to i32
3840 %z3 = trunc i64 %el3 to i32
3841 %s3 = shl nuw i32 %z3, 16
3842 %o1 = or i32 %z2, %s3
3844 store i32 %o0, ptr addrspace(1) %out0, align 4
3845 store i32 %o1, ptr addrspace(1) %out1, align 4
3850 ; Should combine the lower 16 bits from each i32 in load
; trunc_vector: load a <2 x i32> (align 2), truncate it to <2 x i16>, and pack
; the two 16-bit elements into one i32:
;   %o0 = (tvec[1] << 16) | tvec[0]   -> %out0
; %in1 is declared but not referenced in the body.
; Expected codegen differs per target: the GFX10 checks show a
; global_load_ushort plus a global_load_short_d16_hi into the same register
; and no v_perm_b32, while the GFX9 checks show two global_load_ushort
; instructions combined by a v_perm_b32 with selector 0x5040100.
3851 define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3852 ; GFX10-LABEL: trunc_vector:
3854 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3855 ; GFX10-NEXT: s_clause 0x1
3856 ; GFX10-NEXT: global_load_ushort v2, v[0:1], off
3857 ; GFX10-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:4
3858 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3859 ; GFX10-NEXT: global_store_dword v[4:5], v2, off
3860 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3862 ; GFX9-LABEL: trunc_vector:
3864 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3865 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
3866 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:4
3867 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3868 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3869 ; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4
3870 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
3871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3872 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3873 %vec = load <2 x i32>, ptr addrspace(1) %in0, align 2
; Keep only the low 16 bits of each 32-bit element.
3874 %tvec = trunc <2 x i32> %vec to <2 x i16>
3875 %el0 = extractelement <2 x i16> %tvec, i32 0
3876 %el1 = extractelement <2 x i16> %tvec, i32 1
; Element 1 goes to the high half of the result, element 0 to the low half.
3877 %z0 = zext i16 %el0 to i32
3878 %z1 = zext i16 %el1 to i32
3879 %s1 = shl nuw i32 %z1, 16
3880 %o0 = or i32 %s1, %z0
3882 store i32 %o0, ptr addrspace(1) %out0, align 4