1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
; Scalar case: load one i32 through the addrspace(1) pointer %in and store it
; unchanged through %out. Each set of checks below expects a single 32-bit
; load followed by a single 32-bit store.
; The check lines inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with that script rather than
; editing them by hand.
9 define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
10 ; SI-NOHSA-LABEL: global_load_i32:
11 ; SI-NOHSA: ; %bb.0: ; %entry
12 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
13 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
14 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
15 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
16 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
17 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
19 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
20 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
21 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
22 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
23 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
25 ; SI-NOHSA-NEXT: s_endpgm
27 ; GCNX3-HSA-LABEL: global_load_i32:
28 ; GCNX3-HSA: ; %bb.0: ; %entry
29 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
30 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
31 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
33 ; GCNX3-HSA-NEXT: flat_load_dword v2, v[0:1]
34 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
35 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
36 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
37 ; GCNX3-HSA-NEXT: flat_store_dword v[0:1], v2
38 ; GCNX3-HSA-NEXT: s_endpgm
40 ; GCNX3-NOHSA-LABEL: global_load_i32:
41 ; GCNX3-NOHSA: ; %bb.0: ; %entry
42 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
43 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
44 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
45 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
46 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
47 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
48 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
49 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
50 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
51 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
52 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
53 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
54 ; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GCNX3-NOHSA-NEXT: s_endpgm
57 ; EG-LABEL: global_load_i32:
58 ; EG: ; %bb.0: ; %entry
59 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
61 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
62 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
65 ; EG-NEXT: Fetch clause starting at 6:
66 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
67 ; EG-NEXT: ALU clause starting at 8:
68 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
69 ; EG-NEXT: ALU clause starting at 9:
70 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
71 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
73 ; GCN-HSA-LABEL: global_load_i32:
74 ; GCN-HSA: ; %bb.0: ; %entry
75 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
76 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0
77 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
78 ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3]
79 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
80 ; GCN-HSA-NEXT: global_store_dword v0, v1, s[0:1]
81 ; GCN-HSA-NEXT: s_endpgm
83 %ld = load i32, ptr addrspace(1) %in
84 store i32 %ld, ptr addrspace(1) %out
; <2 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The checks below expect one 64-bit access
; in each direction (dwordx2 on the amdgcn runs, VTX_READ_64 plus an .XY store
; on the r600 run).
88 define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
89 ; SI-NOHSA-LABEL: global_load_v2i32:
90 ; SI-NOHSA: ; %bb.0: ; %entry
91 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
92 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
93 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
94 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
95 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
96 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
97 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
98 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
99 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
100 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
101 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
102 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
103 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
104 ; SI-NOHSA-NEXT: s_endpgm
106 ; GCNX3-HSA-LABEL: global_load_v2i32:
107 ; GCNX3-HSA: ; %bb.0: ; %entry
108 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
109 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
110 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
111 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
112 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
113 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
114 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
115 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
116 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
117 ; GCNX3-HSA-NEXT: s_endpgm
119 ; GCNX3-NOHSA-LABEL: global_load_v2i32:
120 ; GCNX3-NOHSA: ; %bb.0: ; %entry
121 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
122 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
123 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
124 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
125 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
126 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
127 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
128 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
129 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
130 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
131 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
132 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
133 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
134 ; GCNX3-NOHSA-NEXT: s_endpgm
136 ; EG-LABEL: global_load_v2i32:
137 ; EG: ; %bb.0: ; %entry
138 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
140 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
141 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
144 ; EG-NEXT: Fetch clause starting at 6:
145 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
146 ; EG-NEXT: ALU clause starting at 8:
147 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
148 ; EG-NEXT: ALU clause starting at 9:
149 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
150 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
152 ; GCN-HSA-LABEL: global_load_v2i32:
153 ; GCN-HSA: ; %bb.0: ; %entry
154 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
155 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
156 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
157 ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
158 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
159 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
160 ; GCN-HSA-NEXT: s_endpgm
162 %ld = load <2 x i32>, ptr addrspace(1) %in
163 store <2 x i32> %ld, ptr addrspace(1) %out
; <3 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The expectations differ per run:
;  - the default-CPU amdgcn run loads a full dwordx4 and stores the three
;    elements as a dword at offset 8 plus a dwordx2 at offset 0;
;  - the r600 run likewise reads 128 bits and issues two stores (.X and .XY);
;  - the kaveri, tonga and gfx900/gfx908 runs use dwordx3 for both the load
;    and the store.
167 define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
168 ; SI-NOHSA-LABEL: global_load_v3i32:
169 ; SI-NOHSA: ; %bb.0: ; %entry
170 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
171 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
172 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
173 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
174 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
175 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
176 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
177 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
178 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
179 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
180 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
181 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
182 ; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
183 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
184 ; SI-NOHSA-NEXT: s_endpgm
186 ; GCNX3-HSA-LABEL: global_load_v3i32:
187 ; GCNX3-HSA: ; %bb.0: ; %entry
188 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
189 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
190 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
191 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
192 ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
193 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s0
194 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s1
195 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
196 ; GCNX3-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
197 ; GCNX3-HSA-NEXT: s_endpgm
199 ; GCNX3-NOHSA-LABEL: global_load_v3i32:
200 ; GCNX3-NOHSA: ; %bb.0: ; %entry
201 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
202 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
203 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
204 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
205 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
206 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
207 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
208 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
209 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
210 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
211 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
212 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
213 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
214 ; GCNX3-NOHSA-NEXT: s_endpgm
216 ; EG-LABEL: global_load_v3i32:
217 ; EG: ; %bb.0: ; %entry
218 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
220 ; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
221 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
222 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
224 ; EG-NEXT: Fetch clause starting at 6:
225 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
226 ; EG-NEXT: ALU clause starting at 8:
227 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
228 ; EG-NEXT: ALU clause starting at 9:
229 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
230 ; EG-NEXT: MOV * T2.X, T0.Z,
231 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
232 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
233 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
234 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
235 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
237 ; GCN-HSA-LABEL: global_load_v3i32:
238 ; GCN-HSA: ; %bb.0: ; %entry
239 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
240 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
241 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
242 ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
243 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
244 ; GCN-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
245 ; GCN-HSA-NEXT: s_endpgm
247 %ld = load <3 x i32>, ptr addrspace(1) %in
248 store <3 x i32> %ld, ptr addrspace(1) %out
; <4 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The checks below expect one 128-bit access
; in each direction (dwordx4 on the amdgcn runs, VTX_READ_128 plus an .XYZW
; store on the r600 run).
252 define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
253 ; SI-NOHSA-LABEL: global_load_v4i32:
254 ; SI-NOHSA: ; %bb.0: ; %entry
255 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
256 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
257 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
258 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
259 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
260 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
261 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
262 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
263 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
264 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
265 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
266 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
267 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
268 ; SI-NOHSA-NEXT: s_endpgm
270 ; GCNX3-HSA-LABEL: global_load_v4i32:
271 ; GCNX3-HSA: ; %bb.0: ; %entry
272 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
273 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
274 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
275 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
276 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
277 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
278 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
279 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
280 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
281 ; GCNX3-HSA-NEXT: s_endpgm
283 ; GCNX3-NOHSA-LABEL: global_load_v4i32:
284 ; GCNX3-NOHSA: ; %bb.0: ; %entry
285 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
286 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
287 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
288 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
289 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
290 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
291 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
292 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
293 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
294 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
295 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
296 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
297 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
298 ; GCNX3-NOHSA-NEXT: s_endpgm
300 ; EG-LABEL: global_load_v4i32:
301 ; EG: ; %bb.0: ; %entry
302 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
304 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
305 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
308 ; EG-NEXT: Fetch clause starting at 6:
309 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
310 ; EG-NEXT: ALU clause starting at 8:
311 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
312 ; EG-NEXT: ALU clause starting at 9:
313 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
314 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
316 ; GCN-HSA-LABEL: global_load_v4i32:
317 ; GCN-HSA: ; %bb.0: ; %entry
318 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
319 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
320 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
321 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
322 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
323 ; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
324 ; GCN-HSA-NEXT: s_endpgm
326 %ld = load <4 x i32>, ptr addrspace(1) %in
327 store <4 x i32> %ld, ptr addrspace(1) %out
; <8 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The checks below expect the vector to be
; split into two 128-bit halves at byte offsets 0 and 16, for both the loads
; and the stores. In the kaveri (flat_*) checks the +16 addresses are formed
; with s_add_u32/s_addc_u32 instead of an instruction offset field.
331 define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
332 ; SI-NOHSA-LABEL: global_load_v8i32:
333 ; SI-NOHSA: ; %bb.0: ; %entry
334 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
335 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
336 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
337 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
338 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
339 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
340 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
341 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
342 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
343 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
344 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
345 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
346 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
347 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
348 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
349 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
350 ; SI-NOHSA-NEXT: s_endpgm
352 ; GCNX3-HSA-LABEL: global_load_v8i32:
353 ; GCNX3-HSA: ; %bb.0: ; %entry
354 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
355 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
356 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
357 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
358 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
359 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
360 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
361 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
362 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
363 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
364 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
365 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
366 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
367 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
368 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
369 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
370 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
371 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
372 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
373 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
374 ; GCNX3-HSA-NEXT: s_endpgm
376 ; GCNX3-NOHSA-LABEL: global_load_v8i32:
377 ; GCNX3-NOHSA: ; %bb.0: ; %entry
378 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
379 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
380 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
381 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
382 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
383 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
384 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
385 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
386 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
387 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
388 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
389 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
390 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
391 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
392 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
393 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
394 ; GCNX3-NOHSA-NEXT: s_endpgm
396 ; EG-LABEL: global_load_v8i32:
397 ; EG: ; %bb.0: ; %entry
398 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
400 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
401 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
402 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
404 ; EG-NEXT: Fetch clause starting at 6:
405 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
406 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
407 ; EG-NEXT: ALU clause starting at 10:
408 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
409 ; EG-NEXT: ALU clause starting at 11:
410 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
411 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
412 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
413 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
414 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
416 ; GCN-HSA-LABEL: global_load_v8i32:
417 ; GCN-HSA: ; %bb.0: ; %entry
418 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
419 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
420 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
421 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
422 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
423 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
424 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
425 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
426 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
427 ; GCN-HSA-NEXT: s_endpgm
429 %ld = load <8 x i32>, ptr addrspace(1) %in
430 store <8 x i32> %ld, ptr addrspace(1) %out
; <9 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The checks below expect two 128-bit
; accesses at byte offsets 0 and 16 plus one 32-bit access at offset 32 for
; the ninth element, in both the load and the store direction.
434 define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
435 ; SI-NOHSA-LABEL: global_load_v9i32:
436 ; SI-NOHSA: ; %bb.0: ; %entry
437 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
438 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
440 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
441 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
442 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
444 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
445 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
446 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
447 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
448 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
449 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
450 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
451 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
452 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
453 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
454 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
455 ; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
456 ; SI-NOHSA-NEXT: s_endpgm
458 ; GCNX3-HSA-LABEL: global_load_v9i32:
459 ; GCNX3-HSA: ; %bb.0: ; %entry
460 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
461 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
462 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
463 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
464 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
465 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
466 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
467 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
468 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
469 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
470 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
471 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
472 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
473 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
474 ; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9]
475 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
476 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
477 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
478 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
479 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
480 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
481 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3
482 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
483 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2
484 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
485 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
486 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
487 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
488 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
489 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
490 ; GCNX3-HSA-NEXT: flat_store_dword v[12:13], v14
491 ; GCNX3-HSA-NEXT: s_endpgm
493 ; GCNX3-NOHSA-LABEL: global_load_v9i32:
494 ; GCNX3-NOHSA: ; %bb.0: ; %entry
495 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
496 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
497 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
498 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
499 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
500 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
501 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
502 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
503 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
504 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
505 ; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
506 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
507 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
508 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
509 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
510 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
511 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
512 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
513 ; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
514 ; GCNX3-NOHSA-NEXT: s_endpgm
516 ; EG-LABEL: global_load_v9i32:
517 ; EG: ; %bb.0: ; %entry
518 ; EG-NEXT: ALU 8, @14, KC0[CB0:0-32], KC1[]
520 ; EG-NEXT: ALU 1, @23, KC0[CB0:0-32], KC1[]
521 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
522 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
523 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
526 ; EG-NEXT: Fetch clause starting at 8:
527 ; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 0, #1
528 ; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 16, #1
529 ; EG-NEXT: VTX_READ_32 T3.X, T3.X, 32, #1
530 ; EG-NEXT: ALU clause starting at 14:
531 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
532 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
533 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
534 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
535 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
536 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
537 ; EG-NEXT: MOV * T2.X, KC0[2].Z,
538 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
539 ; EG-NEXT: MOV * T3.X, PS,
540 ; EG-NEXT: ALU clause starting at 23:
541 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
542 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
544 ; GCN-HSA-LABEL: global_load_v9i32:
545 ; GCN-HSA: ; %bb.0: ; %entry
546 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
547 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
548 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
549 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
550 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
551 ; GCN-HSA-NEXT: global_load_dword v9, v8, s[2:3] offset:32
552 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
553 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
554 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
555 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
556 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
557 ; GCN-HSA-NEXT: global_store_dword v8, v9, s[0:1] offset:32
558 ; GCN-HSA-NEXT: s_endpgm
560 %ld = load <9 x i32>, ptr addrspace(1) %in
561 store <9 x i32> %ld, ptr addrspace(1) %out
; <10 x i32> case: load the vector through the addrspace(1) pointer %in and
; store it unchanged through %out. The amdgcn checks below expect two 128-bit
; accesses at byte offsets 0 and 16 plus one 64-bit (dwordx2) access at
; offset 32, for both loads and stores. The r600 checks instead read a third
; full 128 bits at offset 32 but store only the .XY half of that register.
565 define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
566 ; SI-NOHSA-LABEL: global_load_v10i32:
567 ; SI-NOHSA: ; %bb.0: ; %entry
568 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
569 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
570 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
571 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
572 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
573 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
574 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
575 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
576 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
577 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
578 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
579 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
580 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
581 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
582 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
583 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
584 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
585 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
586 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
587 ; SI-NOHSA-NEXT: s_endpgm
589 ; GCNX3-HSA-LABEL: global_load_v10i32:
590 ; GCNX3-HSA: ; %bb.0: ; %entry
591 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
592 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
593 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
594 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
595 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
596 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
597 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
598 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
599 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
600 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
601 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
602 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
603 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
604 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
605 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
606 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
607 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
608 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
609 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
610 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
611 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
612 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
613 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1
614 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
615 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0
616 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
617 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
618 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
619 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
620 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
621 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
622 ; GCNX3-HSA-NEXT: s_endpgm
624 ; GCNX3-NOHSA-LABEL: global_load_v10i32:
625 ; GCNX3-NOHSA: ; %bb.0: ; %entry
626 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
627 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
628 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
629 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
630 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
631 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
632 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
633 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
634 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
635 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
636 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
637 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
638 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
639 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
640 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
641 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
642 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
643 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
644 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
645 ; GCNX3-NOHSA-NEXT: s_endpgm
647 ; EG-LABEL: global_load_v10i32:
648 ; EG: ; %bb.0: ; %entry
649 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
651 ; EG-NEXT: ALU 7, @15, KC0[CB0:0-32], KC1[]
652 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
653 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
654 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
657 ; EG-NEXT: Fetch clause starting at 8:
658 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
659 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
660 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
661 ; EG-NEXT: ALU clause starting at 14:
662 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
663 ; EG-NEXT: ALU clause starting at 15:
664 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
665 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
666 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
667 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
668 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
669 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
670 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
671 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
673 ; GCN-HSA-LABEL: global_load_v10i32:
674 ; GCN-HSA: ; %bb.0: ; %entry
675 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
676 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0
677 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
678 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3]
679 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v10, s[2:3] offset:16
680 ; GCN-HSA-NEXT: global_load_dwordx2 v[8:9], v10, s[2:3] offset:32
681 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
682 ; GCN-HSA-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
683 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
684 ; GCN-HSA-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
685 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
686 ; GCN-HSA-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
687 ; GCN-HSA-NEXT: s_endpgm
689 %ld = load <10 x i32>, ptr addrspace(1) %in
690 store <10 x i32> %ld, ptr addrspace(1) %out
694 define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
695 ; SI-NOHSA-LABEL: global_load_v11i32:
696 ; SI-NOHSA: ; %bb.0: ; %entry
697 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
698 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
699 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
700 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
701 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
702 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
703 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
704 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
705 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
706 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
707 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
708 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
709 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
710 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
711 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
712 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
713 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
714 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
715 ; SI-NOHSA-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:40
716 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
717 ; SI-NOHSA-NEXT: s_endpgm
719 ; GCNX3-HSA-LABEL: global_load_v11i32:
720 ; GCNX3-HSA: ; %bb.0: ; %entry
721 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
722 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
723 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
724 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
725 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
726 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
727 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
728 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
729 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
730 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
731 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
732 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
733 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
734 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
735 ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
736 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
737 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
738 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
739 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
740 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
741 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
742 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
743 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1
744 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
745 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0
746 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
747 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
748 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
749 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
750 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
751 ; GCNX3-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
752 ; GCNX3-HSA-NEXT: s_endpgm
754 ; GCNX3-NOHSA-LABEL: global_load_v11i32:
755 ; GCNX3-NOHSA: ; %bb.0: ; %entry
756 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
757 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
758 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
759 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
760 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
761 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
762 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
763 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
764 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
765 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
766 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
767 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
768 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
769 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
770 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
771 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
772 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
773 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
774 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
775 ; GCNX3-NOHSA-NEXT: s_endpgm
777 ; EG-LABEL: global_load_v11i32:
778 ; EG: ; %bb.0: ; %entry
779 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
781 ; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
782 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
783 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
784 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
785 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
787 ; EG-NEXT: Fetch clause starting at 8:
788 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
789 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
790 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
791 ; EG-NEXT: ALU clause starting at 14:
792 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
793 ; EG-NEXT: ALU clause starting at 15:
794 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
795 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
796 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
797 ; EG-NEXT: MOV * T4.X, T0.Z,
798 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
799 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
800 ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
801 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
802 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
803 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
804 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
805 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
806 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
808 ; GCN-HSA-LABEL: global_load_v11i32:
809 ; GCN-HSA: ; %bb.0: ; %entry
810 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
811 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
812 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
813 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
814 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v11, s[2:3] offset:16
815 ; GCN-HSA-NEXT: global_load_dwordx3 v[8:10], v11, s[2:3] offset:32
816 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
817 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1]
818 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
819 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] offset:16
820 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
821 ; GCN-HSA-NEXT: global_store_dwordx3 v11, v[8:10], s[0:1] offset:32
822 ; GCN-HSA-NEXT: s_endpgm
824 %ld = load <11 x i32>, ptr addrspace(1) %in
825 store <11 x i32> %ld, ptr addrspace(1) %out
; global_load_v12i32: loads a <12 x i32> through %in and stores the same value
; through %out (both are addrspace(1) pointers).
; The amdgcn expectations below contain three dwordx4 loads followed by three
; dwordx4 stores covering byte offsets 0, 16 and 32; the r600 expectations
; contain three VTX_READ_128 fetches and three XYZW STORE_RAW writes.
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
830 define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
831 ; SI-NOHSA-LABEL: global_load_v12i32:
832 ; SI-NOHSA: ; %bb.0: ; %entry
833 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
834 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
835 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
836 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
837 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
838 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
839 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
840 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
841 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
842 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
843 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
844 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
845 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
846 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
847 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
848 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
849 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
850 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
851 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
852 ; SI-NOHSA-NEXT: s_endpgm
854 ; GCNX3-HSA-LABEL: global_load_v12i32:
855 ; GCNX3-HSA: ; %bb.0: ; %entry
856 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
857 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
858 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
859 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
860 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
861 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
862 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
863 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
864 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
865 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
866 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
867 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
868 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
869 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
870 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
871 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
872 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
873 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
874 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
875 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
876 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
877 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
878 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
879 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
880 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
881 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
882 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
883 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
884 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
885 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
886 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
887 ; GCNX3-HSA-NEXT: s_endpgm
889 ; GCNX3-NOHSA-LABEL: global_load_v12i32:
890 ; GCNX3-NOHSA: ; %bb.0: ; %entry
891 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
892 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
893 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
894 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
895 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
896 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
897 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
898 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
899 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
900 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
901 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
902 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
903 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
904 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
905 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
906 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
907 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
908 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
909 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
910 ; GCNX3-NOHSA-NEXT: s_endpgm
912 ; EG-LABEL: global_load_v12i32:
913 ; EG: ; %bb.0: ; %entry
914 ; EG-NEXT: ALU 7, @14, KC0[CB0:0-32], KC1[]
916 ; EG-NEXT: ALU 1, @22, KC0[CB0:0-32], KC1[]
917 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
918 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
919 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
922 ; EG-NEXT: Fetch clause starting at 8:
923 ; EG-NEXT: VTX_READ_128 T3.XYZW, T2.X, 0, #1
924 ; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 16, #1
925 ; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 32, #1
926 ; EG-NEXT: ALU clause starting at 14:
927 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
928 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
929 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
930 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
931 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
932 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
933 ; EG-NEXT: MOV * T2.X, KC0[2].Z,
934 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
935 ; EG-NEXT: ALU clause starting at 22:
936 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
937 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
939 ; GCN-HSA-LABEL: global_load_v12i32:
940 ; GCN-HSA: ; %bb.0: ; %entry
941 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
942 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
943 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
944 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
945 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
946 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:32
947 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
948 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
949 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
950 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
951 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
952 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:32
953 ; GCN-HSA-NEXT: s_endpgm
; IR under test: one <12 x i32> load from %in, stored unmodified to %out.
955 %ld = load <12 x i32>, ptr addrspace(1) %in
956 store <12 x i32> %ld, ptr addrspace(1) %out
; global_load_v16i32: loads a <16 x i32> through %in and stores the same value
; through %out (both are addrspace(1) pointers).
; The amdgcn expectations below contain four dwordx4 loads followed by four
; dwordx4 stores covering byte offsets 0, 16, 32 and 48; the r600 expectations
; contain four VTX_READ_128 fetches and four XYZW STORE_RAW writes.
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
960 define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
961 ; SI-NOHSA-LABEL: global_load_v16i32:
962 ; SI-NOHSA: ; %bb.0: ; %entry
963 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
964 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
965 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
966 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
967 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
968 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
969 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
970 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
971 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
972 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
973 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
974 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
975 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
976 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
977 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
978 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
979 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
980 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
981 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
982 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
983 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
984 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
985 ; SI-NOHSA-NEXT: s_endpgm
987 ; GCNX3-HSA-LABEL: global_load_v16i32:
988 ; GCNX3-HSA: ; %bb.0: ; %entry
989 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
990 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
991 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
992 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
993 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
994 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
995 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
996 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
997 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
998 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
999 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32
1000 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
1001 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
1002 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
1003 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
1004 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
1005 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1006 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1007 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
1008 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
1009 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
1010 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1011 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
1012 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
1013 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
1014 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
1015 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1016 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
1017 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
1018 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
1019 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
1020 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
1021 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
1022 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
1023 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1024 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1025 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1026 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
1027 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1028 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
1029 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1030 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
1031 ; GCNX3-HSA-NEXT: s_endpgm
1033 ; GCNX3-NOHSA-LABEL: global_load_v16i32:
1034 ; GCNX3-NOHSA: ; %bb.0: ; %entry
1035 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1036 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1037 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1038 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1039 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1040 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1042 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1043 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
1044 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
1045 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
1046 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
1047 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1048 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1049 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1050 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
1051 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1052 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
1053 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1054 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
1055 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1056 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
1057 ; GCNX3-NOHSA-NEXT: s_endpgm
1059 ; EG-LABEL: global_load_v16i32:
1060 ; EG: ; %bb.0: ; %entry
1061 ; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[]
1063 ; EG-NEXT: ALU 1, @28, KC0[], KC1[]
1064 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
1065 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
1066 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
1067 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
1069 ; EG-NEXT: Fetch clause starting at 8:
1070 ; EG-NEXT: VTX_READ_128 T4.XYZW, T3.X, 32, #1
1071 ; EG-NEXT: VTX_READ_128 T5.XYZW, T3.X, 48, #1
1072 ; EG-NEXT: VTX_READ_128 T6.XYZW, T3.X, 0, #1
1073 ; EG-NEXT: VTX_READ_128 T3.XYZW, T3.X, 16, #1
1074 ; EG-NEXT: ALU clause starting at 16:
1075 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1076 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1077 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
1078 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1079 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1080 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1081 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
1082 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1083 ; EG-NEXT: MOV * T3.X, KC0[2].Z,
1084 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1085 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1086 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1087 ; EG-NEXT: ALU clause starting at 28:
1088 ; EG-NEXT: LSHR * T7.X, T0.W, literal.x,
1089 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1091 ; GCN-HSA-LABEL: global_load_v16i32:
1092 ; GCN-HSA: ; %bb.0: ; %entry
1093 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1094 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0
1095 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32
1097 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:48
1098 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3]
1099 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16
1100 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1101 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
1102 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1103 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:48
1104 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1105 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1]
1106 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1107 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16
1108 ; GCN-HSA-NEXT: s_endpgm
; IR under test: one <16 x i32> load from %in, stored unmodified to %out.
1110 %ld = load <16 x i32>, ptr addrspace(1) %in
1111 store <16 x i32> %ld, ptr addrspace(1) %out
; global_zextload_i32_to_i64: loads an i32 through %in, zero-extends it to i64
; and stores the result through %out (both are addrspace(1) pointers).
; The expectations below contain a single dword load, a constant 0 moved into
; the register that forms the upper half of the result, and a single
; two-dword store (dwordx2 on amdgcn, an XY STORE_RAW on r600).
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1115 define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1116 ; SI-NOHSA-LABEL: global_zextload_i32_to_i64:
1117 ; SI-NOHSA: ; %bb.0:
1118 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1119 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1120 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1121 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1122 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1123 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1124 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1125 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1126 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1127 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1128 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1129 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1130 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1131 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1132 ; SI-NOHSA-NEXT: s_endpgm
1134 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
1135 ; GCNX3-HSA: ; %bb.0:
1136 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1137 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1138 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1139 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1140 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1141 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1142 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1143 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1144 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1145 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1146 ; GCNX3-HSA-NEXT: s_endpgm
1148 ; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
1149 ; GCNX3-NOHSA: ; %bb.0:
1150 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1151 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1152 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1153 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1154 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1155 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1156 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1157 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1158 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1159 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1160 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1161 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1162 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1163 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1164 ; GCNX3-NOHSA-NEXT: s_endpgm
1166 ; EG-LABEL: global_zextload_i32_to_i64:
1168 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1170 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1171 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1174 ; EG-NEXT: Fetch clause starting at 6:
1175 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1176 ; EG-NEXT: ALU clause starting at 8:
1177 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1178 ; EG-NEXT: ALU clause starting at 9:
1179 ; EG-NEXT: MOV * T0.Y, 0.0,
1180 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1181 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1183 ; GCN-HSA-LABEL: global_zextload_i32_to_i64:
1185 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1186 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1187 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1188 ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3]
1189 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1190 ; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1191 ; GCN-HSA-NEXT: s_endpgm
; IR under test: i32 load, zext to i64, i64 store.
1192 %ld = load i32, ptr addrspace(1) %in
1193 %ext = zext i32 %ld to i64
1194 store i64 %ext, ptr addrspace(1) %out
; global_sextload_i32_to_i64: loads an i32 through %in, sign-extends it to i64
; and stores the result through %out (both are addrspace(1) pointers).
; The expectations below contain a single dword load, an arithmetic shift
; right by 31 of the loaded value to form the upper half (v_ashrrev_i32 on
; amdgcn, ASHR on r600), and a single two-dword store.
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1198 define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1199 ; SI-NOHSA-LABEL: global_sextload_i32_to_i64:
1200 ; SI-NOHSA: ; %bb.0:
1201 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1202 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1203 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1204 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1205 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1206 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1207 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1208 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1209 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1210 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1211 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1212 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1213 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1214 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1215 ; SI-NOHSA-NEXT: s_endpgm
1217 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
1218 ; GCNX3-HSA: ; %bb.0:
1219 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1220 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1221 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1222 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1223 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1224 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1225 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1226 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1227 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1228 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1229 ; GCNX3-HSA-NEXT: s_endpgm
1231 ; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
1232 ; GCNX3-NOHSA: ; %bb.0:
1233 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1234 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1235 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1236 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1237 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1238 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1240 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1241 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1242 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1243 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1244 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1245 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1246 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1247 ; GCNX3-NOHSA-NEXT: s_endpgm
1249 ; EG-LABEL: global_sextload_i32_to_i64:
1251 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1253 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1254 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1257 ; EG-NEXT: Fetch clause starting at 6:
1258 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1259 ; EG-NEXT: ALU clause starting at 8:
1260 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1261 ; EG-NEXT: ALU clause starting at 9:
1262 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1263 ; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
1264 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1266 ; GCN-HSA-LABEL: global_sextload_i32_to_i64:
1268 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1269 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
1270 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1271 ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3]
1272 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1273 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1274 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1275 ; GCN-HSA-NEXT: s_endpgm
; IR under test: i32 load, sext to i64, i64 store.
1276 %ld = load i32, ptr addrspace(1) %in
1277 %ext = sext i32 %ld to i64
1278 store i64 %ext, ptr addrspace(1) %out
; global_zextload_v1i32_to_v1i64: loads a <1 x i32> through %in, zero-extends
; it to <1 x i64> and stores the result through %out (both are addrspace(1)
; pointers).
; The expectations below contain a single dword load, a constant 0 moved into
; the register that forms the upper half of the result, and a single
; two-dword store (dwordx2 on amdgcn, an XY STORE_RAW on r600).
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1282 define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1283 ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1284 ; SI-NOHSA: ; %bb.0:
1285 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1286 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1287 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1288 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1289 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1290 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1291 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1292 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1293 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1294 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1295 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1296 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1297 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1298 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1299 ; SI-NOHSA-NEXT: s_endpgm
1301 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1302 ; GCNX3-HSA: ; %bb.0:
1303 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1304 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1305 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1306 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1307 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1308 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1309 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1310 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1311 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1312 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1313 ; GCNX3-HSA-NEXT: s_endpgm
1315 ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1316 ; GCNX3-NOHSA: ; %bb.0:
1317 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1318 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1319 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1320 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1321 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1322 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1323 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1324 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1325 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1326 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1327 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1328 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1329 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1330 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1331 ; GCNX3-NOHSA-NEXT: s_endpgm
1333 ; EG-LABEL: global_zextload_v1i32_to_v1i64:
1335 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1337 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1338 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1341 ; EG-NEXT: Fetch clause starting at 6:
1342 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1343 ; EG-NEXT: ALU clause starting at 8:
1344 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1345 ; EG-NEXT: ALU clause starting at 9:
1346 ; EG-NEXT: MOV * T0.Y, 0.0,
1347 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1348 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1350 ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1352 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1353 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1354 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3]
1356 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1357 ; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1358 ; GCN-HSA-NEXT: s_endpgm
; IR under test: <1 x i32> load, zext to <1 x i64>, <1 x i64> store.
1359 %ld = load <1 x i32>, ptr addrspace(1) %in
1360 %ext = zext <1 x i32> %ld to <1 x i64>
1361 store <1 x i64> %ext, ptr addrspace(1) %out
; global_sextload_v1i32_to_v1i64: loads a <1 x i32> through %in, sign-extends
; it to <1 x i64> and stores the result through %out (both are addrspace(1)
; pointers).
; The expectations below contain a single dword load, an arithmetic shift
; right by 31 of the loaded value to form the upper half (v_ashrrev_i32 on
; amdgcn, ASHR on r600), and a single two-dword store.
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1365 define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1366 ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1367 ; SI-NOHSA: ; %bb.0:
1368 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1369 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1370 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1371 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1372 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1373 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1374 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1375 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1376 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1377 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1378 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1379 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1380 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1381 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1382 ; SI-NOHSA-NEXT: s_endpgm
1384 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1385 ; GCNX3-HSA: ; %bb.0:
1386 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1387 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1388 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1389 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1390 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1391 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1392 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1393 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1394 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1395 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1396 ; GCNX3-HSA-NEXT: s_endpgm
1398 ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1399 ; GCNX3-NOHSA: ; %bb.0:
1400 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1401 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1402 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1403 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1404 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1405 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1406 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1407 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1408 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1409 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1410 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1411 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1412 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1413 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1414 ; GCNX3-NOHSA-NEXT: s_endpgm
1416 ; EG-LABEL: global_sextload_v1i32_to_v1i64:
1418 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1420 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1421 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1424 ; EG-NEXT: Fetch clause starting at 6:
1425 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1426 ; EG-NEXT: ALU clause starting at 8:
1427 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1428 ; EG-NEXT: ALU clause starting at 9:
1429 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1430 ; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
1431 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1433 ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1435 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1436 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
1437 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3]
1439 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1440 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1441 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1442 ; GCN-HSA-NEXT: s_endpgm
; IR under test: <1 x i32> load, sext to <1 x i64>, <1 x i64> store.
1443 %ld = load <1 x i32>, ptr addrspace(1) %in
1444 %ext = sext <1 x i32> %ld to <1 x i64>
1445 store <1 x i64> %ext, ptr addrspace(1) %out
; global_zextload_v2i32_to_v2i64: loads a <2 x i32> through %in, zero-extends
; it to <2 x i64> and stores the result through %out (both are addrspace(1)
; pointers).
; The expectations below contain a single two-dword load, moves that place
; the two loaded dwords in elements 0 and 2 of a four-dword tuple with zero in
; elements 1 and 3, and a single four-dword store.
; The assertion lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1449 define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1450 ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1451 ; SI-NOHSA: ; %bb.0:
1452 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1453 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1454 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1455 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1456 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1457 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1458 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1459 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1460 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1461 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1462 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1
1463 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1464 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1465 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1466 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
1467 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
1468 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1469 ; SI-NOHSA-NEXT: s_endpgm
1471 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1472 ; GCNX3-HSA: ; %bb.0:
1473 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1474 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1475 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1476 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1477 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1478 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1479 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
1480 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
1481 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1482 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v2
1483 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v3
1484 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1
1485 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1486 ; GCNX3-HSA-NEXT: s_endpgm
1488 ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1489 ; GCNX3-NOHSA: ; %bb.0:
1490 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1491 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1492 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1493 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1494 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1495 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1496 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1497 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1498 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1499 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1500 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1501 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1502 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1503 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2
1504 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3
1505 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1
1506 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1507 ; GCNX3-NOHSA-NEXT: s_endpgm
1509 ; EG-LABEL: global_zextload_v2i32_to_v2i64:
1511 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1513 ; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
1514 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1517 ; EG-NEXT: Fetch clause starting at 6:
1518 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1519 ; EG-NEXT: ALU clause starting at 8:
1520 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1521 ; EG-NEXT: ALU clause starting at 9:
1522 ; EG-NEXT: MOV T1.X, T0.X,
1523 ; EG-NEXT: MOV T1.Y, 0.0,
1524 ; EG-NEXT: MOV T1.Z, T0.Y,
1525 ; EG-NEXT: MOV T1.W, 0.0,
1526 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1527 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1529 ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1531 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1532 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1533 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1534 ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
1535 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1536 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v2
1537 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v3
1538 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
1539 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
1540 ; GCN-HSA-NEXT: s_endpgm
; IR under test: <2 x i32> load, zext to <2 x i64>, <2 x i64> store.
1541 %ld = load <2 x i32>, ptr addrspace(1) %in
1542 %ext = zext <2 x i32> %ld to <2 x i64>
1543 store <2 x i64> %ext, ptr addrspace(1) %out
; Test: sign-extending load of <2 x i32> from global memory (addrspace 1),
; widened to <2 x i64> and stored back to global memory.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; Shape checked on every run line: a single 64-bit load, an arithmetic shift
; right by 31 of each loaded dword to form the high half of each i64, then a
; single 128-bit store.
1547 define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1548 ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1549 ; SI-NOHSA: ; %bb.0:
1550 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1551 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1552 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1553 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1554 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1555 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1556 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1557 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1558 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1559 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1560 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1561 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1562 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
1563 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
1564 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
1565 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
1566 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1567 ; SI-NOHSA-NEXT: s_endpgm
1569 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1570 ; GCNX3-HSA: ; %bb.0:
1571 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1572 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1573 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1574 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1575 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
1576 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s0
1577 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s1
1578 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1579 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
1580 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
1581 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
1582 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
1583 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
1584 ; GCNX3-HSA-NEXT: s_endpgm
1586 ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1587 ; GCNX3-NOHSA: ; %bb.0:
1588 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1589 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1590 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1591 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1592 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1593 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1594 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1595 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1596 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1597 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1598 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1599 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1600 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
1601 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
1602 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v4
1603 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v5
1604 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1605 ; GCNX3-NOHSA-NEXT: s_endpgm
1607 ; EG-LABEL: global_sextload_v2i32_to_v2i64:
1609 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1611 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
1612 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1615 ; EG-NEXT: Fetch clause starting at 6:
1616 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1617 ; EG-NEXT: ALU clause starting at 8:
1618 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1619 ; EG-NEXT: ALU clause starting at 9:
1620 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.x,
1621 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1622 ; EG-NEXT: ASHR * T1.Y, T0.X, literal.x,
1623 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1624 ; EG-NEXT: MOV T1.X, T0.X,
1625 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1626 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1627 ; EG-NEXT: MOV * T1.Z, T0.Y,
1629 ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1631 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1632 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0
1633 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1634 ; GCN-HSA-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
1635 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1636 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
1637 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
1638 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
1639 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
1640 ; GCN-HSA-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
1641 ; GCN-HSA-NEXT: s_endpgm
; IR under test: load two i32 lanes from %in, sign-extend each lane to i64,
; and store the <2 x i64> result to %out.
1642 %ld = load <2 x i32>, ptr addrspace(1) %in
1643 %ext = sext <2 x i32> %ld to <2 x i64>
1644 store <2 x i64> %ext, ptr addrspace(1) %out
; Test: zero-extending load of <4 x i32> from global memory (addrspace 1),
; widened to <4 x i64> and stored back to global memory.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; Shape checked on every run line: a single 128-bit load, the high half of
; each i64 supplied by a register holding 0, and two 128-bit stores covering
; byte offsets 16 and 0 from %out.
; In the amdgcn checks both stores reuse one data register tuple; between the
; first store and the overwrite of that tuple the SI run expects
; `s_waitcnt expcnt(0)` while the other amdgcn runs expect `s_nop 0`.
1648 define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1649 ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1650 ; SI-NOHSA: ; %bb.0:
1651 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1652 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1653 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1654 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1655 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1656 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1657 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1658 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1659 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1660 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0
1661 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5
1662 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1663 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1664 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1665 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
1666 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
1667 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1668 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1669 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0
1670 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1
1671 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1672 ; SI-NOHSA-NEXT: s_endpgm
1674 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1675 ; GCNX3-HSA: ; %bb.0:
1676 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1677 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
1678 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
1679 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1680 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1681 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1682 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1683 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1684 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1685 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
1686 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
1687 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1688 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
1689 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
1690 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
1691 ; GCNX3-HSA-NEXT: s_nop 0
1692 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v0
1693 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v1
1694 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
1695 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
1696 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1697 ; GCNX3-HSA-NEXT: s_endpgm
1699 ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1700 ; GCNX3-NOHSA: ; %bb.0:
1701 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1702 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1703 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1704 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1705 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1706 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1707 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1708 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1709 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1710 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0
1711 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5
1712 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1713 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1714 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1715 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
1716 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
1717 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1718 ; GCNX3-NOHSA-NEXT: s_nop 0
1719 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
1720 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
1721 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1722 ; GCNX3-NOHSA-NEXT: s_endpgm
1724 ; EG-LABEL: global_zextload_v4i32_to_v4i64:
1726 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1728 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1729 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
1730 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1732 ; EG-NEXT: Fetch clause starting at 6:
1733 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1734 ; EG-NEXT: ALU clause starting at 8:
1735 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1736 ; EG-NEXT: ALU clause starting at 9:
1737 ; EG-NEXT: MOV T1.X, T0.Z,
1738 ; EG-NEXT: MOV T1.Y, 0.0,
1739 ; EG-NEXT: MOV * T2.X, T0.X,
1740 ; EG-NEXT: MOV T2.Y, 0.0,
1741 ; EG-NEXT: MOV T1.Z, T0.W,
1742 ; EG-NEXT: MOV T1.W, 0.0,
1743 ; EG-NEXT: MOV * T2.Z, T0.Y,
1744 ; EG-NEXT: MOV * T2.W, 0.0,
1745 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1746 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
1747 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1748 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
1749 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1751 ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1753 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1754 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1755 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
1756 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1757 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3]
1758 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1759 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
1760 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
1761 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
1762 ; GCN-HSA-NEXT: s_nop 0
1763 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
1764 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
1765 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
1766 ; GCN-HSA-NEXT: s_endpgm
; IR under test: load four i32 lanes from %in, zero-extend each lane to i64,
; and store the <4 x i64> result to %out.
1767 %ld = load <4 x i32>, ptr addrspace(1) %in
1768 %ext = zext <4 x i32> %ld to <4 x i64>
1769 store <4 x i64> %ext, ptr addrspace(1) %out
; Test: sign-extending load of <4 x i32> from global memory (addrspace 1),
; widened to <4 x i64> and stored back to global memory.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; Shape checked on every run line: a single 128-bit load, four arithmetic
; shifts right by 31 (one per loaded dword) producing the high halves, and two
; 128-bit stores covering byte offsets 16 and 0 from %out.
; In the amdgcn checks each store reads its own register tuple and no wait or
; nop appears between the two stores.
1773 define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1774 ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1775 ; SI-NOHSA: ; %bb.0:
1776 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1777 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1778 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1779 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1780 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1781 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1782 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1783 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1784 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1785 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1786 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1787 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1788 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1789 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1790 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1791 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1792 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v2
1793 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v3
1794 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v0
1795 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, v1
1796 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1797 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1798 ; SI-NOHSA-NEXT: s_endpgm
1800 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1801 ; GCNX3-HSA: ; %bb.0:
1802 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1803 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1804 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1805 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1806 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1807 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1808 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1809 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
1810 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
1811 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
1812 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
1813 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1814 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1815 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1816 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v2
1817 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v3
1818 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1819 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1820 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v0
1821 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, v1
1822 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
1823 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[3:6]
1824 ; GCNX3-HSA-NEXT: s_endpgm
1826 ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1827 ; GCNX3-NOHSA: ; %bb.0:
1828 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1829 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1830 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1831 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1832 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1833 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1834 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1835 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1836 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1837 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1838 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1839 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1840 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1841 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1842 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v2
1843 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v3
1844 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1845 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1846 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0
1847 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1
1848 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1849 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1850 ; GCNX3-NOHSA-NEXT: s_endpgm
1852 ; EG-LABEL: global_sextload_v4i32_to_v4i64:
1854 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1856 ; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
1857 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
1858 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
1860 ; EG-NEXT: Fetch clause starting at 6:
1861 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1862 ; EG-NEXT: ALU clause starting at 8:
1863 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1864 ; EG-NEXT: ALU clause starting at 9:
1865 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.x,
1866 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1867 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
1868 ; EG-NEXT: ASHR T1.Y, T0.X, literal.y,
1869 ; EG-NEXT: ASHR T3.W, T0.W, literal.y,
1870 ; EG-NEXT: MOV * T1.X, T0.X,
1871 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1872 ; EG-NEXT: ASHR * T3.Y, T0.Z, literal.x,
1873 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1874 ; EG-NEXT: MOV T3.X, T0.Z,
1875 ; EG-NEXT: MOV T1.Z, T0.Y,
1876 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1877 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1878 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
1879 ; EG-NEXT: MOV * T3.Z, T0.W,
1880 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1882 ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1884 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1885 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
1886 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1887 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
1888 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1889 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1890 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1891 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v2
1892 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v3
1893 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1894 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1895 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v0
1896 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
1897 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[7:10], s[0:1] offset:16
1898 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[3:6], s[0:1]
1899 ; GCN-HSA-NEXT: s_endpgm
; IR under test: load four i32 lanes from %in, sign-extend each lane to i64,
; and store the <4 x i64> result to %out.
1900 %ld = load <4 x i32>, ptr addrspace(1) %in
1901 %ext = sext <4 x i32> %ld to <4 x i64>
1902 store <4 x i64> %ext, ptr addrspace(1) %out
; Test: zero-extending load of <8 x i32> from global memory (addrspace 1),
; widened to <8 x i64> and stored back to global memory.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of hand-editing.
; Shape checked on every run line: two 128-bit loads (byte offsets 0 and 16
; from %in), the high half of each i64 supplied by a register holding 0, and
; four 128-bit stores covering byte offsets 0, 16, 32 and 48 from %out.
; In the amdgcn checks all four stores reuse one data register tuple; the
; sequence waits with `s_waitcnt vmcnt(1)` before the first loaded value is
; consumed and with a `vmcnt(2)` wait before the other load's value is used.
; The flat-addressing run (kaveri) forms each store address with
; s_add_u32/s_addc_u32, whereas the buffer and global forms use an immediate
; `offset:` operand.
1906 define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1907 ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1908 ; SI-NOHSA: ; %bb.0:
1909 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
1910 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
1911 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
1912 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
1913 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
1914 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1915 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
1916 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
1917 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1918 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1919 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, 0
1920 ; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v9
1921 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
1922 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
1923 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
1924 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v2
1925 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v3
1926 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
1927 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1928 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v0
1929 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v1
1930 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1931 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) expcnt(0)
1932 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v6
1933 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v7
1934 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
1935 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1936 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v4
1937 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v5
1938 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
1939 ; SI-NOHSA-NEXT: s_endpgm
1941 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
1942 ; GCNX3-HSA: ; %bb.0:
1943 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1944 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
1945 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
1946 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1947 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1948 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1949 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
1950 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
1951 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1952 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
1953 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
1954 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1955 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1956 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1957 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
1958 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
1959 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
1960 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
1961 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1962 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
1963 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
1964 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
1965 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
1966 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
1967 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
1968 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2
1969 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3
1970 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
1971 ; GCNX3-HSA-NEXT: s_nop 0
1972 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
1973 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
1974 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
1975 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
1976 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
1977 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
1978 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
1979 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
1980 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
1981 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
1982 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
1983 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1984 ; GCNX3-HSA-NEXT: s_endpgm
1986 ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1987 ; GCNX3-NOHSA: ; %bb.0:
1988 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
1989 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
1990 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
1991 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
1992 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
1993 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1994 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
1995 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
1996 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1997 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1998 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
1999 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
2000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2001 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2002 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
2003 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
2004 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
2005 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
2006 ; GCNX3-NOHSA-NEXT: s_nop 0
2007 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
2008 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
2009 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2010 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
2011 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
2012 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
2013 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2014 ; GCNX3-NOHSA-NEXT: s_nop 0
2015 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
2016 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
2017 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
2018 ; GCNX3-NOHSA-NEXT: s_endpgm
2020 ; EG-LABEL: global_zextload_v8i32_to_v8i64:
2022 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
2024 ; EG-NEXT: ALU 26, @13, KC0[CB0:0-32], KC1[]
2025 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
2026 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
2027 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
2028 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 1
2030 ; EG-NEXT: Fetch clause starting at 8:
2031 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
2032 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2033 ; EG-NEXT: ALU clause starting at 12:
2034 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2035 ; EG-NEXT: ALU clause starting at 13:
2036 ; EG-NEXT: MOV T2.X, T1.Z,
2037 ; EG-NEXT: MOV T2.Y, 0.0,
2038 ; EG-NEXT: MOV * T3.X, T1.X,
2039 ; EG-NEXT: MOV * T3.Y, 0.0,
2040 ; EG-NEXT: MOV T4.X, T0.Z,
2041 ; EG-NEXT: MOV T4.Y, 0.0,
2042 ; EG-NEXT: MOV * T5.X, T0.X,
2043 ; EG-NEXT: MOV T5.Y, 0.0,
2044 ; EG-NEXT: MOV T2.Z, T1.W,
2045 ; EG-NEXT: MOV T2.W, 0.0,
2046 ; EG-NEXT: MOV * T3.Z, T1.Y,
2047 ; EG-NEXT: MOV * T3.W, 0.0,
2048 ; EG-NEXT: MOV T4.Z, T0.W,
2049 ; EG-NEXT: MOV T4.W, 0.0,
2050 ; EG-NEXT: MOV * T5.Z, T0.Y,
2051 ; EG-NEXT: MOV * T5.W, 0.0,
2052 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2053 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2054 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2055 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
2056 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2057 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2058 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
2059 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2060 ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
2061 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
2062 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2064 ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
2066 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2067 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
2068 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
2069 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2070 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:16
2071 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3]
2072 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
2073 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
2074 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
2075 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
2076 ; GCN-HSA-NEXT: s_nop 0
2077 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
2078 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
2079 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
2080 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
2081 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
2082 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
2083 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
2084 ; GCN-HSA-NEXT: s_nop 0
2085 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
2086 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
2087 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
2088 ; GCN-HSA-NEXT: s_endpgm
; IR under test: load eight i32 lanes from %in, zero-extend each lane to i64,
; and store the <8 x i64> result to %out.
2089 %ld = load <8 x i32>, ptr addrspace(1) %in
2090 %ext = zext <8 x i32> %ld to <8 x i64>
2091 store <8 x i64> %ext, ptr addrspace(1) %out
2095 define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2096 ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2097 ; SI-NOHSA: ; %bb.0:
2098 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
2099 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2100 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
2101 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
2102 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
2103 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2104 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
2105 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
2106 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2107 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
2108 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
2109 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2110 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
2111 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2112 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2113 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2114 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2115 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
2116 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2117 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2118 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2119 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2120 ; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v6
2121 ; SI-NOHSA-NEXT: v_mov_b32_e32 v21, v7
2122 ; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v4
2123 ; SI-NOHSA-NEXT: v_mov_b32_e32 v17, v5
2124 ; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v2
2125 ; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v3
2126 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v0
2127 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v1
2128 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
2129 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
2130 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
2131 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
2132 ; SI-NOHSA-NEXT: s_endpgm
2134 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2135 ; GCNX3-HSA: ; %bb.0:
2136 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2137 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2138 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2139 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2140 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
2141 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2142 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
2143 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2144 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
2145 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2146 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2147 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2148 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
2149 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
2150 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
2151 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
2152 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2153 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
2154 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
2155 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2156 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2157 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2158 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
2159 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
2160 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
2161 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
2162 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
2163 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3
2164 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2
2165 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v2
2166 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v3
2167 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
2168 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
2169 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
2170 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
2171 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
2172 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
2173 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
2174 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
2175 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
2176 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
2177 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
2178 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
2179 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
2180 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
2181 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
2182 ; GCNX3-HSA-NEXT: s_endpgm
2184 ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2185 ; GCNX3-NOHSA: ; %bb.0:
2186 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2187 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2188 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2189 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2190 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2191 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2192 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2193 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2194 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2195 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2196 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2197 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2198 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
2199 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2200 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
2201 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2202 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2203 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v6
2204 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v7
2205 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2206 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2207 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2208 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2209 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2210 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v4
2211 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, v5
2212 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v2
2213 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v13, v3
2214 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v0
2215 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v1
2216 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
2217 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
2218 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
2219 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
2220 ; GCNX3-NOHSA-NEXT: s_endpgm
2222 ; EG-LABEL: global_sextload_v8i32_to_v8i64:
2224 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
2226 ; EG-NEXT: ALU 31, @13, KC0[CB0:0-32], KC1[]
2227 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
2228 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
2229 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
2230 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1
2232 ; EG-NEXT: Fetch clause starting at 8:
2233 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
2234 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2235 ; EG-NEXT: ALU clause starting at 12:
2236 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2237 ; EG-NEXT: ALU clause starting at 13:
2238 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2239 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2240 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2241 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2242 ; EG-NEXT: ADD_INT T2.W, KC0[2].Y, literal.y,
2243 ; EG-NEXT: ASHR * T4.W, T0.Y, literal.z,
2244 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2245 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2246 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
2247 ; EG-NEXT: ASHR T4.Y, T0.X, literal.y,
2248 ; EG-NEXT: ASHR T6.W, T0.W, literal.y,
2249 ; EG-NEXT: MOV * T4.X, T0.X,
2250 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
2251 ; EG-NEXT: ASHR T6.Y, T0.Z, literal.x,
2252 ; EG-NEXT: ASHR * T7.W, T1.Y, literal.x,
2253 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2254 ; EG-NEXT: MOV T6.X, T0.Z,
2255 ; EG-NEXT: ASHR T7.Y, T1.X, literal.x,
2256 ; EG-NEXT: MOV T4.Z, T0.Y,
2257 ; EG-NEXT: ASHR T8.W, T1.W, literal.x,
2258 ; EG-NEXT: MOV * T7.X, T1.X,
2259 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2260 ; EG-NEXT: ASHR T8.Y, T1.Z, literal.x,
2261 ; EG-NEXT: MOV * T6.Z, T0.W,
2262 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2263 ; EG-NEXT: MOV T8.X, T1.Z,
2264 ; EG-NEXT: MOV T7.Z, T1.Y,
2265 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2266 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2267 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
2268 ; EG-NEXT: MOV * T8.Z, T1.W,
2269 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2271 ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2273 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2274 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0
2275 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2276 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3]
2277 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v23, s[2:3] offset:16
2278 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
2279 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2280 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
2281 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2282 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2283 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v6
2284 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v7
2285 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2286 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2287 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2288 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2289 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2290 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4
2291 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5
2292 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v2
2293 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v3
2294 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v0
2295 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v1
2296 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[19:22], s[0:1] offset:48
2297 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[15:18], s[0:1] offset:32
2298 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[11:14], s[0:1] offset:16
2299 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[7:10], s[0:1]
2300 ; GCN-HSA-NEXT: s_endpgm
2301 %ld = load <8 x i32>, ptr addrspace(1) %in
2302 %ext = sext <8 x i32> %ld to <8 x i64>
2303 store <8 x i64> %ext, ptr addrspace(1) %out
; global_sextload_v16i32_to_v16i64
;
; Kernel under test: loads a <16 x i32> vector from the addrspace(1) pointer
; %in, sign-extends every lane to i64 and stores the resulting <16 x i64>
; vector through the addrspace(1) pointer %out.
;
; The assertion lines inside the function are the expected llc output for each
; check prefix used by the RUN lines at the top of this file. According to the
; note on the first line of the file they are produced by
; utils/update_llc_test_checks.py, so regenerate them with that script rather
; than editing them by hand.
;
; What the expected code shows:
;  * amdgcn prefixes - the input is fetched with four dwordx4 loads, the high
;    dword of each i64 lane is produced by v_ashrrev_i32 by 31 of the matching
;    low dword, and the result is written with eight dwordx4 stores covering
;    byte offsets 0 through 112 of the output.
;  * EG prefix - four VTX_READ_128 fetches, ASHR by the literal 31 for the high
;    halves, and eight MEM_RAT_CACHELESS STORE_RAW writes.
2307 define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2308 ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2309 ; SI-NOHSA: ; %bb.0:
2310 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
2311 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2312 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
2313 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
2314 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
2315 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2316 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
2317 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
2318 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2319 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2320 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2321 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2322 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
2323 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2324 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2325 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
2326 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v0
2327 ; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v0
2328 ; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v1
2329 ; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2330 ; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2331 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
2332 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
2333 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6
2334 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2335 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2336 ; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v4
2337 ; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5
2338 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
2339 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
2340 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
2341 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
2342 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
2343 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2344 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2345 ; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
2346 ; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
2347 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
2348 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
2349 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
2350 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2351 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2352 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2353 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2354 ; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12
2355 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13
2356 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14
2357 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15
2358 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
2359 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
2360 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
2361 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2362 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
2363 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
2364 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2365 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2366 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2367 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2368 ; SI-NOHSA-NEXT: s_endpgm
2370 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2371 ; GCNX3-HSA: ; %bb.0:
2372 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2373 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2374 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2375 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2376 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
2377 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
2378 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2379 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
2380 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
2381 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
2382 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
2383 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2384 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
2385 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
2386 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
2387 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2388 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2389 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2390 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2391 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2392 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2393 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2394 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2395 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2396 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
2397 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2398 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
2399 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
2400 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
2401 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2402 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
2403 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
2404 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
2405 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
2406 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
2407 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2408 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
2409 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
2410 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
2411 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
2412 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
2413 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2414 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2415 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2416 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
2417 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2418 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
2419 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
2420 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
2421 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v15
2422 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v14
2423 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14
2424 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15
2425 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2426 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
2427 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
2428 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11
2429 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10
2430 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9
2431 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8
2432 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
2433 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
2434 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v10
2435 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v11
2436 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
2437 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2438 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2439 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
2440 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15]
2441 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
2442 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
2443 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
2444 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
2445 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
2446 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
2447 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
2448 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
2449 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
2450 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2451 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
2452 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
2453 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
2454 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
2455 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
2456 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
2457 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
2458 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
2459 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
2460 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
2461 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
2462 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
2463 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
2464 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
2465 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7]
2466 ; GCNX3-HSA-NEXT: s_endpgm
2468 ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2469 ; GCNX3-NOHSA: ; %bb.0:
2470 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2471 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2472 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2473 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2474 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2475 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2476 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2477 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2478 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
2479 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
2480 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2481 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2482 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2483 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2484 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
2485 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2486 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
2487 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2488 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2489 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v4
2490 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v5
2491 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2492 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
2493 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
2494 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6
2495 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7
2496 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
2497 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
2498 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
2499 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
2500 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2501 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2502 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
2503 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
2504 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
2505 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2506 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2507 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8
2508 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9
2509 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v10
2510 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v11
2511 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
2512 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2513 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2514 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2515 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2516 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12
2517 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13
2518 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v14
2519 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v15
2520 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
2521 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
2522 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2523 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2524 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2525 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
2526 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2527 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2528 ; GCNX3-NOHSA-NEXT: s_endpgm
2530 ; EG-LABEL: global_sextload_v16i32_to_v16i64:
2532 ; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
2533 ; EG-NEXT: TEX 3 @12
2534 ; EG-NEXT: ALU 64, @21, KC0[CB0:0-32], KC1[]
2535 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
2536 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
2537 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
2538 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0
2539 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
2540 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0
2541 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0
2542 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1
2544 ; EG-NEXT: Fetch clause starting at 12:
2545 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
2546 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
2547 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
2548 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2549 ; EG-NEXT: ALU clause starting at 20:
2550 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2551 ; EG-NEXT: ALU clause starting at 21:
2552 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x,
2553 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2554 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2555 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2556 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2557 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x,
2558 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2559 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
2560 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2561 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2562 ; EG-NEXT: LSHR T7.X, PV.W, literal.x,
2563 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2564 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
2565 ; EG-NEXT: LSHR T8.X, PV.W, literal.x,
2566 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2567 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
2568 ; EG-NEXT: LSHR T9.X, PV.W, literal.x,
2569 ; EG-NEXT: ADD_INT T4.W, KC0[2].Y, literal.y,
2570 ; EG-NEXT: ASHR * T10.W, T0.W, literal.z,
2571 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
2572 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2573 ; EG-NEXT: LSHR T11.X, PV.W, literal.x,
2574 ; EG-NEXT: ASHR T10.Y, T0.Z, literal.y,
2575 ; EG-NEXT: ASHR T12.W, T0.Y, literal.y,
2576 ; EG-NEXT: MOV * T10.X, T0.Z,
2577 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
2578 ; EG-NEXT: ASHR T12.Y, T0.X, literal.x,
2579 ; EG-NEXT: ASHR * T13.W, T3.W, literal.x,
2580 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2581 ; EG-NEXT: MOV T12.X, T0.X,
2582 ; EG-NEXT: ASHR T13.Y, T3.Z, literal.x,
2583 ; EG-NEXT: MOV T10.Z, T0.W,
2584 ; EG-NEXT: ASHR T14.W, T3.Y, literal.x,
2585 ; EG-NEXT: MOV * T13.X, T3.Z,
2586 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2587 ; EG-NEXT: ASHR T14.Y, T3.X, literal.x,
2588 ; EG-NEXT: MOV T12.Z, T0.Y,
2589 ; EG-NEXT: ASHR * T0.W, T2.W, literal.x,
2590 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2591 ; EG-NEXT: MOV T14.X, T3.X,
2592 ; EG-NEXT: ASHR T0.Y, T2.Z, literal.x,
2593 ; EG-NEXT: MOV T13.Z, T3.W,
2594 ; EG-NEXT: ASHR T15.W, T2.Y, literal.x,
2595 ; EG-NEXT: MOV * T0.X, T2.Z,
2596 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2597 ; EG-NEXT: ASHR T15.Y, T2.X, literal.x,
2598 ; EG-NEXT: MOV T14.Z, T3.Y,
2599 ; EG-NEXT: ASHR * T3.W, T1.W, literal.x,
2600 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2601 ; EG-NEXT: MOV T15.X, T2.X,
2602 ; EG-NEXT: ASHR T3.Y, T1.Z, literal.x,
2603 ; EG-NEXT: MOV T0.Z, T2.W,
2604 ; EG-NEXT: ASHR T16.W, T1.Y, literal.x,
2605 ; EG-NEXT: MOV * T3.X, T1.Z,
2606 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2607 ; EG-NEXT: ASHR T16.Y, T1.X, literal.x,
2608 ; EG-NEXT: MOV * T15.Z, T2.Y,
2609 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2610 ; EG-NEXT: MOV T16.X, T1.X,
2611 ; EG-NEXT: MOV T3.Z, T1.W,
2612 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2613 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
2614 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
2615 ; EG-NEXT: MOV * T16.Z, T1.Y,
2616 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2618 ; GCN-GFX900-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2619 ; GCN-GFX900-HSA: ; %bb.0:
2620 ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2621 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v36, 0
2622 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
2623 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
2624 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
2625 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
2626 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
2627 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(3)
2628 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2629 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(2)
2630 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2631 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2632 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v24, v4
2633 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v26, v5
2634 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2635 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
2636 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
2637 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v20, v6
2638 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v22, v7
2639 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
2640 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
2641 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
2642 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
2643 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v16, v2
2644 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v18, v3
2645 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
2646 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
2647 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
2648 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2649 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2650 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v8
2651 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v9
2652 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v10
2653 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v11
2654 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
2655 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2656 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2657 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2658 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2659 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v32, v12
2660 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v34, v13
2661 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v14
2662 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v15
2663 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
2664 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
2665 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
2666 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
2667 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
2668 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
2669 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
2670 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
2671 ; GCN-GFX900-HSA-NEXT: s_endpgm
2673 ; GCN-GFX908-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2674 ; GCN-GFX908-HSA: ; %bb.0:
2675 ; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2676 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, 0
2677 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
2678 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:32
2679 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:48
2680 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v0, s[2:3] offset:16
2681 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v0, s[2:3]
2682 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
2683 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v4
2684 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
2685 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v6
2686 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v5
2687 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v5
2688 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v6
2689 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3
2690 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v8
2691 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2692 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v21, v7
2693 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v23, v8
2694 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
2695 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
2696 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v5, v1
2697 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v7, v2
2698 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v3
2699 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v4
2700 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
2701 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v12
2702 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v11
2703 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10
2704 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v9
2705 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v9
2706 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v10
2707 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v1, v11
2708 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v3, v12
2709 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(0)
2710 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
2711 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
2712 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v14
2713 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v13
2714 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v13
2715 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v14
2716 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15
2717 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16
2718 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:96
2719 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:112
2720 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:64
2721 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
2722 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:32
2723 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:48
2724 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[33:36], s[0:1]
2725 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:16
2726 ; GCN-GFX908-HSA-NEXT: s_endpgm
; IR under test: one full-width vector load, a lane-wise sign extension from
; i32 to i64, and one full-width vector store of the widened value.
2727 %ld = load <16 x i32>, ptr addrspace(1) %in
2728 %ext = sext <16 x i32> %ld to <16 x i64>
2729 store <16 x i64> %ext, ptr addrspace(1) %out
2733 define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2734 ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2735 ; SI-NOHSA: ; %bb.0:
2736 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
2737 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2738 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
2739 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
2740 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
2741 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2742 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
2743 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
2744 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2745 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0
2746 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
2747 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5
2748 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
2749 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
2750 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2751 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
2752 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
2753 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0
2754 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1
2755 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
2756 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2757 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
2758 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
2759 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
2760 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
2761 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v8
2762 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v9
2763 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2764 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2765 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
2766 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
2767 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
2768 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
2769 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v16
2770 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v17
2771 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
2772 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2773 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v18
2774 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v19
2775 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2776 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2777 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v12
2778 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v13
2779 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
2780 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2781 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v14
2782 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v15
2783 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2784 ; SI-NOHSA-NEXT: s_endpgm
2786 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
2787 ; GCNX3-HSA: ; %bb.0:
2788 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2789 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
2790 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
2791 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2792 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
2793 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2794 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
2795 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2796 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
2797 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2798 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48
2799 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2800 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2801 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
2802 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
2803 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2804 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
2805 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
2806 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2807 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
2808 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
2809 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
2810 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2811 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2812 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2813 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2814 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
2815 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2816 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
2817 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
2818 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
2819 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2820 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
2821 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
2822 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
2823 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
2824 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
2825 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2826 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
2827 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0
2828 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1
2829 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2830 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2831 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2832 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
2833 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2
2834 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3
2835 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2836 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2
2837 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
2838 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3
2839 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
2840 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4
2841 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5
2842 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
2843 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
2844 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2845 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6
2846 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7
2847 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
2848 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2849 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
2850 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
2851 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
2852 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
2853 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
2854 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2855 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10
2856 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11
2857 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2858 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
2859 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
2860 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
2861 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
2862 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
2863 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2864 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
2865 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14
2866 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15
2867 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
2868 ; GCNX3-HSA-NEXT: s_endpgm
2870 ; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2871 ; GCNX3-NOHSA: ; %bb.0:
2872 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2873 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2874 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2875 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2876 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2877 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2878 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2879 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2880 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2881 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2882 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2883 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2884 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, 0
2885 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v17
2886 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2887 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2888 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
2889 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v0
2890 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v1
2891 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
2892 ; GCNX3-NOHSA-NEXT: s_nop 0
2893 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2894 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2895 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2896 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
2897 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v4
2898 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v5
2899 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
2900 ; GCNX3-NOHSA-NEXT: s_nop 0
2901 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v6
2902 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v7
2903 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2904 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
2905 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v8
2906 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v9
2907 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
2908 ; GCNX3-NOHSA-NEXT: s_nop 0
2909 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v10
2910 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v11
2911 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2912 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
2913 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v12
2914 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v13
2915 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
2916 ; GCNX3-NOHSA-NEXT: s_nop 0
2917 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v14
2918 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v15
2919 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
2920 ; GCNX3-NOHSA-NEXT: s_endpgm
2922 ; EG-LABEL: global_zextload_v16i32_to_v16i64:
2924 ; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
2925 ; EG-NEXT: TEX 3 @12
2926 ; EG-NEXT: ALU 55, @21, KC0[CB0:0-32], KC1[]
2927 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
2928 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
2929 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
2930 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
2931 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0
2932 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0
2933 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0
2934 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1
2936 ; EG-NEXT: Fetch clause starting at 12:
2937 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
2938 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
2939 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
2940 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
2941 ; EG-NEXT: ALU clause starting at 20:
2942 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2943 ; EG-NEXT: ALU clause starting at 21:
2944 ; EG-NEXT: MOV T4.X, T1.X,
2945 ; EG-NEXT: MOV T4.Y, 0.0,
2946 ; EG-NEXT: MOV * T5.X, T1.Z,
2947 ; EG-NEXT: MOV * T5.Y, 0.0,
2948 ; EG-NEXT: MOV T6.X, T0.X,
2949 ; EG-NEXT: MOV T6.Y, 0.0,
2950 ; EG-NEXT: MOV * T7.X, T0.Z,
2951 ; EG-NEXT: MOV * T7.Y, 0.0,
2952 ; EG-NEXT: MOV T8.X, T3.X,
2953 ; EG-NEXT: MOV T8.Y, 0.0,
2954 ; EG-NEXT: MOV * T9.X, T3.Z,
2955 ; EG-NEXT: MOV * T9.Y, 0.0,
2956 ; EG-NEXT: MOV T10.X, T2.X,
2957 ; EG-NEXT: MOV T10.Y, 0.0,
2958 ; EG-NEXT: MOV * T11.X, T2.Z,
2959 ; EG-NEXT: MOV T11.Y, 0.0,
2960 ; EG-NEXT: MOV T4.Z, T1.Y,
2961 ; EG-NEXT: MOV T4.W, 0.0,
2962 ; EG-NEXT: MOV * T5.Z, T1.W,
2963 ; EG-NEXT: MOV * T5.W, 0.0,
2964 ; EG-NEXT: MOV T6.Z, T0.Y,
2965 ; EG-NEXT: MOV T6.W, 0.0,
2966 ; EG-NEXT: MOV * T7.Z, T0.W,
2967 ; EG-NEXT: MOV * T7.W, 0.0,
2968 ; EG-NEXT: MOV T8.Z, T3.Y,
2969 ; EG-NEXT: MOV T8.W, 0.0,
2970 ; EG-NEXT: MOV * T9.Z, T3.W,
2971 ; EG-NEXT: MOV * T9.W, 0.0,
2972 ; EG-NEXT: MOV T10.Z, T2.Y,
2973 ; EG-NEXT: MOV T10.W, 0.0,
2974 ; EG-NEXT: MOV * T11.Z, T2.W,
2975 ; EG-NEXT: MOV T11.W, 0.0,
2976 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2977 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2978 ; EG-NEXT: LSHR T0.X, PS, literal.x,
2979 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2980 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2981 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2982 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2983 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
2984 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2985 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2986 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2987 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2988 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
2989 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
2990 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2991 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
2992 ; EG-NEXT: LSHR T13.X, PV.W, literal.x,
2993 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2994 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
2995 ; EG-NEXT: LSHR T14.X, PV.W, literal.x,
2996 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2997 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
2998 ; EG-NEXT: LSHR * T15.X, PV.W, literal.x,
2999 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3001 ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64:
3003 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3004 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
3005 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
3006 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
3007 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:48
3008 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:32
3009 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:16
3010 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3]
3011 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
3012 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
3013 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
3014 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
3015 ; GCN-HSA-NEXT: s_nop 0
3016 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
3017 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
3018 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
3019 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
3020 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
3021 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
3022 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
3023 ; GCN-HSA-NEXT: s_nop 0
3024 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
3025 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
3026 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
3027 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
3028 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12
3029 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13
3030 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
3031 ; GCN-HSA-NEXT: s_nop 0
3032 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14
3033 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15
3034 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
3035 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
3036 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16
3037 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17
3038 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
3039 ; GCN-HSA-NEXT: s_nop 0
3040 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18
3041 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19
3042 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
3043 ; GCN-HSA-NEXT: s_endpgm
3044 %ld = load <16 x i32>, ptr addrspace(1) %in
3045 %ext = zext <16 x i32> %ld to <16 x i64>
3046 store <16 x i64> %ext, ptr addrspace(1) %out
3050 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3051 ; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
3052 ; SI-NOHSA: ; %bb.0:
3053 ; SI-NOHSA-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
3054 ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
3055 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1
3056 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000
3057 ; SI-NOHSA-NEXT: s_add_u32 s12, s12, s11
3058 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0
3059 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
3060 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3061 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
3062 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
3063 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
3064 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3065 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
3066 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
3067 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
3068 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3069 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3070 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3071 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3072 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3073 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
3074 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
3075 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
3076 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31
3077 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30
3078 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
3079 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
3080 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
3081 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
3082 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
3083 ; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v12
3084 ; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13
3085 ; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14
3086 ; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15
3087 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
3088 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
3089 ; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28
3090 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
3091 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
3092 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
3093 ; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
3094 ; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
3095 ; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
3096 ; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
3097 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(9)
3098 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
3099 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
3100 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3101 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
3102 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
3103 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v4
3104 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5
3105 ; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6
3106 ; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7
3107 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
3108 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
3109 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
3110 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
3111 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
3112 ; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v0
3113 ; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1
3114 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
3115 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
3116 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
3117 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
3118 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
3119 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
3120 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
3121 ; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16
3122 ; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17
3123 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3124 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3125 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
3126 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
3127 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
3128 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
3129 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
3130 ; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20
3131 ; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21
3132 ; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22
3133 ; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23
3134 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
3135 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27
3136 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26
3137 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25
3138 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24
3139 ; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24
3140 ; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25
3141 ; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26
3142 ; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27
3143 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
3144 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11
3145 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10
3146 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
3147 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
3148 ; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
3149 ; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
3150 ; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10
3151 ; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11
3152 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
3153 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
3154 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3155 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3156 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
3157 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
3158 ; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
3159 ; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3160 ; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
3161 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
3162 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
3163 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3164 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
3165 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3166 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3167 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3168 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3169 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3170 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3171 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
3172 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
3173 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
3174 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
3175 ; SI-NOHSA-NEXT: s_endpgm
3177 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3178 ; GCNX3-HSA: ; %bb.0:
3179 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3180 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
3181 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3182 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3183 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
3184 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70
3185 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3186 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3187 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3188 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1]
3189 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x60
3190 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3191 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3192 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3193 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50
3194 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
3195 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3196 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3197 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3198 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
3199 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
3200 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3201 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3202 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3203 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
3204 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
3205 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3206 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
3207 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
3208 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13]
3209 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
3210 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
3211 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
3212 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
3213 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3214 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
3215 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3216 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
3217 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3218 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
3219 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
3220 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3221 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1
3222 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0
3223 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
3224 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
3225 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
3226 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28
3227 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29
3228 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
3229 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
3230 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0
3231 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3232 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35]
3233 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3
3234 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2
3235 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0
3236 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31
3237 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30
3238 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30
3239 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31
3240 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3241 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35]
3242 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8)
3243 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25
3244 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
3245 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
3246 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0
3247 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3248 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3
3249 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2
3250 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0
3251 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24
3252 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24
3253 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25
3254 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3255 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
3256 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3
3257 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2
3258 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0
3259 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27
3260 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26
3261 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26
3262 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27
3263 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3264 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
3265 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
3266 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
3267 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0
3268 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3269 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3
3270 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2
3271 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80
3272 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
3273 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v21
3274 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20
3275 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20
3276 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21
3277 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3278 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23
3279 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22
3280 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v22
3281 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v23
3282 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27]
3283 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
3284 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
3285 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v11
3286 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v10
3287 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
3288 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
3289 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v8
3290 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v9
3291 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v10
3292 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v11
3293 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
3294 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
3295 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
3296 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
3297 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
3298 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
3299 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
3300 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
3301 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3302 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
3303 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
3304 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
3305 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
3306 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
3307 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
3308 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
3309 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
3310 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
3311 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7
3312 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3313 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
3314 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
3315 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17
3316 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16
3317 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16
3318 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17
3319 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
3320 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
3321 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
3322 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3323 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
3324 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
3325 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
3326 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
3327 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19
3328 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18
3329 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18
3330 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19
3331 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3332 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[23:26]
3333 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
3334 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
3335 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
3336 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3
3337 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
3338 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
3339 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
3340 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
3341 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3342 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[16:19]
3343 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3
3344 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15
3345 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14
3346 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v14
3347 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v15
3348 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
3349 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
3350 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
3351 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
3352 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
3353 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v1
3354 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v0
3355 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3356 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3357 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3358 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
3359 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
3360 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
3361 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
3362 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3
3363 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2
3364 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2
3365 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3
3366 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
3367 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
3368 ; GCNX3-HSA-NEXT: s_endpgm
3370 ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
3371 ; GCNX3-NOHSA: ; %bb.0:
3372 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
3373 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3374 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
3375 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
3376 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
3377 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3378 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
3379 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
3380 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3381 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3382 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3383 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3384 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3385 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3386 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
3387 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
3388 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
3389 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
3390 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
3391 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11
3392 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10
3393 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
3394 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
3395 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
3396 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
3397 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
3398 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12
3399 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13
3400 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14
3401 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15
3402 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
3403 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8
3404 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8
3405 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9
3406 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10
3407 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11
3408 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
3409 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
3410 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
3411 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
3412 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
3413 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4
3414 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5
3415 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
3416 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
3417 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
3418 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
3419 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
3420 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
3421 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
3422 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
3423 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
3424 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3425 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
3426 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
3427 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
3428 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
3429 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0
3430 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1
3431 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
3432 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
3433 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
3434 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16
3435 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17
3436 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3437 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
3438 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
3439 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
3440 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v20
3441 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v21
3442 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v22
3443 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v23
3444 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
3445 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
3446 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26
3447 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3448 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3449 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
3450 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24
3451 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
3452 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31
3453 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30
3454 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
3455 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31
3456 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29
3457 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28
3458 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28
3459 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29
3460 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
3461 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3462 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
3463 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3464 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3465 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3466 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3467 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3468 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3469 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
3470 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30
3471 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24
3472 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25
3473 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26
3474 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27
3475 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
3476 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0
3477 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
3478 ; GCNX3-NOHSA-NEXT: s_endpgm
3480 ; EG-LABEL: global_sextload_v32i32_to_v32i64:
3482 ; EG-NEXT: ALU 33, @36, KC0[CB0:0-32], KC1[]
3483 ; EG-NEXT: TEX 7 @20
3484 ; EG-NEXT: ALU 96, @70, KC0[CB0:0-32], KC1[]
3485 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
3486 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
3487 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
3488 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
3489 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
3490 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
3491 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
3492 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
3493 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
3494 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
3495 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
3496 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
3497 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
3498 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
3499 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
3500 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
3502 ; EG-NEXT: Fetch clause starting at 20:
3503 ; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 112, #1
3504 ; EG-NEXT: VTX_READ_128 T13.XYZW, T11.X, 96, #1
3505 ; EG-NEXT: VTX_READ_128 T14.XYZW, T11.X, 80, #1
3506 ; EG-NEXT: VTX_READ_128 T15.XYZW, T11.X, 64, #1
3507 ; EG-NEXT: VTX_READ_128 T16.XYZW, T11.X, 48, #1
3508 ; EG-NEXT: VTX_READ_128 T17.XYZW, T11.X, 32, #1
3509 ; EG-NEXT: VTX_READ_128 T18.XYZW, T11.X, 16, #1
3510 ; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
3511 ; EG-NEXT: ALU clause starting at 36:
3512 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3513 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3514 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
3515 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3516 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3517 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3518 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
3519 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
3520 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3521 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
3522 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
3523 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3524 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
3525 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
3526 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3527 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
3528 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
3529 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3530 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
3531 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
3532 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3533 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
3534 ; EG-NEXT: LSHR T7.X, PV.W, literal.x,
3535 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3536 ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
3537 ; EG-NEXT: LSHR T8.X, PV.W, literal.x,
3538 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3539 ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
3540 ; EG-NEXT: LSHR T9.X, PV.W, literal.x,
3541 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3542 ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
3543 ; EG-NEXT: LSHR T10.X, PV.W, literal.x,
3544 ; EG-NEXT: MOV * T11.X, KC0[2].Z,
3545 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3546 ; EG-NEXT: ALU clause starting at 70:
3547 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3548 ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
3549 ; EG-NEXT: LSHR T19.X, PV.W, literal.x,
3550 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3551 ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
3552 ; EG-NEXT: LSHR T20.X, PV.W, literal.x,
3553 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3554 ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
3555 ; EG-NEXT: LSHR T21.X, PV.W, literal.x,
3556 ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
3557 ; EG-NEXT: ASHR * T22.W, T11.W, literal.z,
3558 ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
3559 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3560 ; EG-NEXT: LSHR T23.X, PV.W, literal.x,
3561 ; EG-NEXT: ASHR T22.Y, T11.Z, literal.y,
3562 ; EG-NEXT: ASHR T24.W, T11.Y, literal.y,
3563 ; EG-NEXT: MOV * T22.X, T11.Z,
3564 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
3565 ; EG-NEXT: ASHR T24.Y, T11.X, literal.x,
3566 ; EG-NEXT: ASHR * T25.W, T18.W, literal.x,
3567 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3568 ; EG-NEXT: MOV T24.X, T11.X,
3569 ; EG-NEXT: ASHR T25.Y, T18.Z, literal.x,
3570 ; EG-NEXT: MOV T22.Z, T11.W,
3571 ; EG-NEXT: ASHR T26.W, T18.Y, literal.x,
3572 ; EG-NEXT: MOV * T25.X, T18.Z,
3573 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3574 ; EG-NEXT: ASHR T26.Y, T18.X, literal.x,
3575 ; EG-NEXT: MOV T24.Z, T11.Y,
3576 ; EG-NEXT: ASHR * T11.W, T17.W, literal.x,
3577 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3578 ; EG-NEXT: MOV T26.X, T18.X,
3579 ; EG-NEXT: ASHR T11.Y, T17.Z, literal.x,
3580 ; EG-NEXT: MOV T25.Z, T18.W,
3581 ; EG-NEXT: ASHR T27.W, T17.Y, literal.x,
3582 ; EG-NEXT: MOV * T11.X, T17.Z,
3583 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3584 ; EG-NEXT: ASHR T27.Y, T17.X, literal.x,
3585 ; EG-NEXT: MOV T26.Z, T18.Y,
3586 ; EG-NEXT: ASHR * T18.W, T16.W, literal.x,
3587 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3588 ; EG-NEXT: MOV T27.X, T17.X,
3589 ; EG-NEXT: ASHR T18.Y, T16.Z, literal.x,
3590 ; EG-NEXT: MOV T11.Z, T17.W,
3591 ; EG-NEXT: ASHR T28.W, T16.Y, literal.x,
3592 ; EG-NEXT: MOV * T18.X, T16.Z,
3593 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3594 ; EG-NEXT: ASHR T28.Y, T16.X, literal.x,
3595 ; EG-NEXT: MOV T27.Z, T17.Y,
3596 ; EG-NEXT: ASHR * T17.W, T15.W, literal.x,
3597 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3598 ; EG-NEXT: MOV T28.X, T16.X,
3599 ; EG-NEXT: ASHR T17.Y, T15.Z, literal.x,
3600 ; EG-NEXT: MOV T18.Z, T16.W,
3601 ; EG-NEXT: ASHR T29.W, T15.Y, literal.x,
3602 ; EG-NEXT: MOV * T17.X, T15.Z,
3603 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3604 ; EG-NEXT: ASHR T29.Y, T15.X, literal.x,
3605 ; EG-NEXT: MOV T28.Z, T16.Y,
3606 ; EG-NEXT: ASHR * T16.W, T14.W, literal.x,
3607 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3608 ; EG-NEXT: MOV T29.X, T15.X,
3609 ; EG-NEXT: ASHR T16.Y, T14.Z, literal.x,
3610 ; EG-NEXT: MOV T17.Z, T15.W,
3611 ; EG-NEXT: ASHR T30.W, T14.Y, literal.x,
3612 ; EG-NEXT: MOV * T16.X, T14.Z,
3613 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3614 ; EG-NEXT: ASHR T30.Y, T14.X, literal.x,
3615 ; EG-NEXT: MOV T29.Z, T15.Y,
3616 ; EG-NEXT: ASHR * T15.W, T13.W, literal.x,
3617 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3618 ; EG-NEXT: MOV T30.X, T14.X,
3619 ; EG-NEXT: ASHR T15.Y, T13.Z, literal.x,
3620 ; EG-NEXT: MOV T16.Z, T14.W,
3621 ; EG-NEXT: ASHR T31.W, T13.Y, literal.x,
3622 ; EG-NEXT: MOV * T15.X, T13.Z,
3623 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3624 ; EG-NEXT: ASHR T31.Y, T13.X, literal.x,
3625 ; EG-NEXT: MOV T30.Z, T14.Y,
3626 ; EG-NEXT: ASHR * T14.W, T12.W, literal.x,
3627 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3628 ; EG-NEXT: MOV T31.X, T13.X,
3629 ; EG-NEXT: ASHR T14.Y, T12.Z, literal.x,
3630 ; EG-NEXT: MOV T15.Z, T13.W,
3631 ; EG-NEXT: ASHR T32.W, T12.Y, literal.x,
3632 ; EG-NEXT: MOV * T14.X, T12.Z,
3633 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3634 ; EG-NEXT: ASHR T32.Y, T12.X, literal.x,
3635 ; EG-NEXT: MOV * T31.Z, T13.Y,
3636 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3637 ; EG-NEXT: MOV T32.X, T12.X,
3638 ; EG-NEXT: MOV T14.Z, T12.W,
3639 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3640 ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
3641 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
3642 ; EG-NEXT: MOV * T32.Z, T12.Y,
3643 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3645 ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3646 ; GCN-GFX900-HSA: ; %bb.0:
3647 ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
3648 ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
3649 ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3650 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
3651 ; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
3652 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
3653 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
3654 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3655 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3656 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3657 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3658 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3659 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3660 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3661 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
3662 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
3663 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v2
3664 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v3
3665 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(4)
3666 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
3667 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
3668 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5
3669 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4
3670 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v4
3671 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v5
3672 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6
3673 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7
3674 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
3675 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
3676 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
3677 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
3678 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
3679 ; GCN-GFX900-HSA-NEXT: s_nop 0
3680 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
3681 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
3682 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
3683 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
3684 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
3685 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
3686 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
3687 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9
3688 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v9
3689 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v10
3690 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v11
3691 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v12
3692 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
3693 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
3694 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
3695 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14
3696 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13
3697 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v13
3698 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v14
3699 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v9, v15
3700 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v11, v16
3701 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3702 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
3703 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
3704 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18
3705 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17
3706 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v17
3707 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v18
3708 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19
3709 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3710 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20
3711 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3712 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
3713 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
3714 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
3715 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
3716 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21
3717 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22
3718 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23
3719 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24
3720 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3]
3721 ; GCN-GFX900-HSA-NEXT: s_nop 0
3722 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3723 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3724 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3725 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
3726 ; GCN-GFX900-HSA-NEXT: s_nop 0
3727 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
3728 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
3729 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
3730 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
3731 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
3732 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
3733 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50
3734 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49
3735 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v49
3736 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v50
3737 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v51
3738 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v52
3739 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
3740 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24
3741 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23
3742 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22
3743 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21
3744 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v21
3745 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v22
3746 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
3747 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3748 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3749 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3750 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3751 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3752 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3753 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3754 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3755 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3756 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3757 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3758 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
3759 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v23
3760 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v24
3761 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3762 ; GCN-GFX900-HSA-NEXT: s_endpgm
3764 ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3765 ; GCN-GFX908-HSA: ; %bb.0:
3766 ; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3767 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0
3768 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
3769 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3770 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3771 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3772 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3773 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3774 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3775 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3776 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6)
3777 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v2
3778 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
3779 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
3780 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v3
3781 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v25
3782 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v26
3783 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v27
3784 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v28
3785 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(4)
3786 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
3787 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
3788 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
3789 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9
3790 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v9
3791 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v10
3792 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v11
3793 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v12
3794 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3795 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
3796 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
3797 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14
3798 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13
3799 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v13
3800 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v14
3801 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15
3802 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16
3803 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
3804 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
3805 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
3806 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18
3807 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17
3808 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v17
3809 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v18
3810 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19
3811 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20
3812 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
3813 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
3814 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
3815 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
3816 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
3817 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21
3818 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22
3819 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23
3820 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24
3821 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3]
3822 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
3823 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5
3824 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4
3825 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v4
3826 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v5
3827 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
3828 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6
3829 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7
3830 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3831 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3832 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a3
3833 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
3834 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
3835 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v0
3836 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v1
3837 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a2
3838 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a1
3839 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v32, a0
3840 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3841 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
3842 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
3843 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50
3844 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49
3845 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v49
3846 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v50
3847 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v51
3848 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v52
3849 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3850 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3851 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24
3852 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23
3853 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22
3854 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21
3855 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v21
3856 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v22
3857 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3858 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3859 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3860 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3861 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3862 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3863 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3864 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3865 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3866 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3867 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3868 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
3869 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v28, v23
3870 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v30, v24
3871 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3872 ; GCN-GFX908-HSA-NEXT: s_endpgm
3873 %ld = load <32 x i32>, ptr addrspace(1) %in
3874 %ext = sext <32 x i32> %ld to <32 x i64>
3875 store <32 x i64> %ext, ptr addrspace(1) %out
3879 define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3880 ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
3881 ; SI-NOHSA: ; %bb.0:
3882 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
3883 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3884 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
3885 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
3886 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
3887 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
3888 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1
3889 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3890 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
3891 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
3892 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
3893 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
3894 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
3895 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3896 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
3897 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
3898 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3899 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
3900 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
3901 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
3902 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
3903 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
3904 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
3905 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
3906 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3907 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
3908 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
3909 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
3910 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0)
3911 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
3912 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
3913 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
3914 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3915 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
3916 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
3917 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
3918 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
3919 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32
3920 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v33
3921 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
3922 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3923 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v34
3924 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v35
3925 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
3926 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3927 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v28
3928 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v29
3929 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
3930 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3931 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v30
3932 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v31
3933 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
3934 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3935 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v24
3936 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v25
3937 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
3938 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3939 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v26
3940 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v27
3941 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3942 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3943 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v20
3944 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v21
3945 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
3946 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3947 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v22
3948 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v23
3949 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
3950 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3951 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v16
3952 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v17
3953 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
3954 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3955 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3956 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3957 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
3958 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3959 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12
3960 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13
3961 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3962 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3963 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14
3964 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15
3965 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3966 ; SI-NOHSA-NEXT: s_endpgm
3968 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
3969 ; GCNX3-HSA: ; %bb.0:
3970 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3971 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
3972 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
3973 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3974 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
3975 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3976 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
3977 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3978 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48
3979 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
3980 ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0
3981 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64
3982 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0
3983 ; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50
3984 ; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0
3985 ; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60
3986 ; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0
3987 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x70
3988 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
3989 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3990 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3991 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1]
3992 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s14
3993 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s15
3994 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1]
3995 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s12
3996 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s13
3997 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
3998 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s10
3999 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s11
4000 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1]
4001 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8
4002 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9
4003 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
4004 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
4005 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
4006 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
4007 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
4008 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
4009 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
4010 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
4011 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
4012 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1
4013 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4014 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4015 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28
4016 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29
4017 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1
4018 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0
4019 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
4020 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
4021 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
4022 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0
4023 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4024 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30
4025 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31
4026 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
4027 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
4028 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0
4029 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4030 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
4031 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
4032 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
4033 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0
4034 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4035 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8)
4036 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32
4037 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33
4038 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
4039 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
4040 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
4041 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0
4042 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4043 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
4044 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
4045 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0
4046 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4047 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34
4048 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35
4049 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3
4050 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2
4051 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0
4052 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4053 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
4054 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
4055 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
4056 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80
4057 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4058 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
4059 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24
4060 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25
4061 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
4062 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
4063 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
4064 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4065 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
4066 ; GCNX3-HSA-NEXT: s_nop 0
4067 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26
4068 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27
4069 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
4070 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
4071 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
4072 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4073 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
4074 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
4075 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
4076 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20
4077 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21
4078 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3]
4079 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
4080 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22
4081 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23
4082 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
4083 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
4084 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4085 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11)
4086 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16
4087 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17
4088 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
4089 ; GCNX3-HSA-NEXT: s_nop 0
4090 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18
4091 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19
4092 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
4093 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
4094 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8
4095 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9
4096 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
4097 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
4098 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
4099 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
4100 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4101 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10
4102 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11
4103 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
4104 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
4105 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
4106 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
4107 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
4108 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12
4109 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13
4110 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4111 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
4112 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
4113 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
4114 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
4115 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14
4116 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15
4117 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4118 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
4119 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
4120 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
4121 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
4122 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
4123 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
4124 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
4125 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4126 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
4127 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6
4128 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7
4129 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
4130 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4131 ; GCNX3-HSA-NEXT: s_endpgm
4133 ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
4134 ; GCNX3-NOHSA: ; %bb.0:
4135 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
4136 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
4137 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
4138 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
4139 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
4140 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4141 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
4142 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
4143 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4144 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4145 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4146 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4147 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
4148 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
4149 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
4150 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0
4151 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, 0
4152 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v31, v29
4153 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
4154 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
4155 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4156 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v0
4157 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v1
4158 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224
4159 ; GCNX3-NOHSA-NEXT: s_nop 0
4160 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v2
4161 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v3
4162 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240
4163 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(8)
4164 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v4
4165 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v5
4166 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192
4167 ; GCNX3-NOHSA-NEXT: s_nop 0
4168 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v6
4169 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v7
4170 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208
4171 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(9)
4172 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8
4173 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9
4174 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
4175 ; GCNX3-NOHSA-NEXT: s_nop 0
4176 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v10
4177 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v11
4178 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
4179 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(10)
4180 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v12
4181 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v13
4182 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128
4183 ; GCNX3-NOHSA-NEXT: s_nop 0
4184 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v14
4185 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v15
4186 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
4187 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(11)
4188 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v16
4189 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v17
4190 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
4191 ; GCNX3-NOHSA-NEXT: s_nop 0
4192 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v18
4193 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v19
4194 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
4195 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(12)
4196 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v20
4197 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v21
4198 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
4199 ; GCNX3-NOHSA-NEXT: s_nop 0
4200 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v22
4201 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v23
4202 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
4203 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(13)
4204 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v24
4205 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v25
4206 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
4207 ; GCNX3-NOHSA-NEXT: s_nop 0
4208 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v26
4209 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v27
4210 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
4211 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(14)
4212 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v32
4213 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v33
4214 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
4215 ; GCNX3-NOHSA-NEXT: s_nop 0
4216 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v34
4217 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v35
4218 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
4219 ; GCNX3-NOHSA-NEXT: s_endpgm
4221 ; EG-LABEL: global_zextload_v32i32_to_v32i64:
4223 ; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
4224 ; EG-NEXT: TEX 2 @22
4225 ; EG-NEXT: ALU 10, @39, KC0[], KC1[]
4226 ; EG-NEXT: TEX 4 @28
4227 ; EG-NEXT: ALU 100, @50, KC0[CB0:0-32], KC1[]
4228 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
4229 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
4230 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
4231 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
4232 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
4233 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
4234 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
4235 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
4236 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
4237 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
4238 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
4239 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
4240 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
4241 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
4242 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
4243 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
4245 ; EG-NEXT: Fetch clause starting at 22:
4246 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 112, #1
4247 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 80, #1
4248 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 96, #1
4249 ; EG-NEXT: Fetch clause starting at 28:
4250 ; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
4251 ; EG-NEXT: VTX_READ_128 T11.XYZW, T0.X, 16, #1
4252 ; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 32, #1
4253 ; EG-NEXT: VTX_READ_128 T13.XYZW, T0.X, 48, #1
4254 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 64, #1
4255 ; EG-NEXT: ALU clause starting at 38:
4256 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
4257 ; EG-NEXT: ALU clause starting at 39:
4258 ; EG-NEXT: MOV T4.X, T1.X,
4259 ; EG-NEXT: MOV T4.Y, 0.0,
4260 ; EG-NEXT: MOV * T5.X, T1.Z,
4261 ; EG-NEXT: MOV * T5.Y, 0.0,
4262 ; EG-NEXT: MOV T6.X, T3.X,
4263 ; EG-NEXT: MOV T6.Y, 0.0,
4264 ; EG-NEXT: MOV * T7.X, T3.Z,
4265 ; EG-NEXT: MOV * T7.Y, 0.0,
4266 ; EG-NEXT: MOV T8.X, T2.X,
4267 ; EG-NEXT: MOV T8.Y, 0.0,
4268 ; EG-NEXT: MOV * T9.X, T2.Z,
4269 ; EG-NEXT: ALU clause starting at 50:
4270 ; EG-NEXT: MOV * T9.Y, 0.0,
4271 ; EG-NEXT: MOV T14.X, T0.X,
4272 ; EG-NEXT: MOV T14.Y, 0.0,
4273 ; EG-NEXT: MOV * T15.X, T0.Z,
4274 ; EG-NEXT: MOV * T15.Y, 0.0,
4275 ; EG-NEXT: MOV T16.X, T13.X,
4276 ; EG-NEXT: MOV T16.Y, 0.0,
4277 ; EG-NEXT: MOV * T17.X, T13.Z,
4278 ; EG-NEXT: MOV * T17.Y, 0.0,
4279 ; EG-NEXT: MOV T18.X, T12.X,
4280 ; EG-NEXT: MOV T18.Y, 0.0,
4281 ; EG-NEXT: MOV * T19.X, T12.Z,
4282 ; EG-NEXT: MOV * T19.Y, 0.0,
4283 ; EG-NEXT: MOV T20.X, T11.X,
4284 ; EG-NEXT: MOV T20.Y, 0.0,
4285 ; EG-NEXT: MOV * T21.X, T11.Z,
4286 ; EG-NEXT: MOV * T21.Y, 0.0,
4287 ; EG-NEXT: MOV T22.X, T10.X,
4288 ; EG-NEXT: MOV T22.Y, 0.0,
4289 ; EG-NEXT: MOV * T23.X, T10.Z,
4290 ; EG-NEXT: MOV T23.Y, 0.0,
4291 ; EG-NEXT: MOV T4.Z, T1.Y,
4292 ; EG-NEXT: MOV T4.W, 0.0,
4293 ; EG-NEXT: MOV * T5.Z, T1.W,
4294 ; EG-NEXT: MOV * T5.W, 0.0,
4295 ; EG-NEXT: MOV T6.Z, T3.Y,
4296 ; EG-NEXT: MOV T6.W, 0.0,
4297 ; EG-NEXT: MOV * T7.Z, T3.W,
4298 ; EG-NEXT: MOV * T7.W, 0.0,
4299 ; EG-NEXT: MOV T8.Z, T2.Y,
4300 ; EG-NEXT: MOV T8.W, 0.0,
4301 ; EG-NEXT: MOV * T9.Z, T2.W,
4302 ; EG-NEXT: MOV * T9.W, 0.0,
4303 ; EG-NEXT: MOV T14.Z, T0.Y,
4304 ; EG-NEXT: MOV T14.W, 0.0,
4305 ; EG-NEXT: MOV * T15.Z, T0.W,
4306 ; EG-NEXT: MOV * T15.W, 0.0,
4307 ; EG-NEXT: MOV T16.Z, T13.Y,
4308 ; EG-NEXT: MOV T16.W, 0.0,
4309 ; EG-NEXT: MOV * T17.Z, T13.W,
4310 ; EG-NEXT: MOV * T17.W, 0.0,
4311 ; EG-NEXT: MOV T18.Z, T12.Y,
4312 ; EG-NEXT: MOV T18.W, 0.0,
4313 ; EG-NEXT: MOV * T19.Z, T12.W,
4314 ; EG-NEXT: MOV * T19.W, 0.0,
4315 ; EG-NEXT: MOV T20.Z, T11.Y,
4316 ; EG-NEXT: MOV T20.W, 0.0,
4317 ; EG-NEXT: MOV * T21.Z, T11.W,
4318 ; EG-NEXT: MOV * T21.W, 0.0,
4319 ; EG-NEXT: MOV T22.Z, T10.Y,
4320 ; EG-NEXT: MOV T22.W, 0.0,
4321 ; EG-NEXT: MOV * T23.Z, T10.W,
4322 ; EG-NEXT: MOV T23.W, 0.0,
4323 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4324 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4325 ; EG-NEXT: LSHR T0.X, PS, literal.x,
4326 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4327 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4328 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4329 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4330 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4331 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4332 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
4333 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
4334 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4335 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
4336 ; EG-NEXT: LSHR T10.X, PV.W, literal.x,
4337 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4338 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
4339 ; EG-NEXT: LSHR T11.X, PV.W, literal.x,
4340 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4341 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
4342 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
4343 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4344 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
4345 ; EG-NEXT: LSHR T13.X, PV.W, literal.x,
4346 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4347 ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
4348 ; EG-NEXT: LSHR T24.X, PV.W, literal.x,
4349 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4350 ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
4351 ; EG-NEXT: LSHR T25.X, PV.W, literal.x,
4352 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4353 ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
4354 ; EG-NEXT: LSHR T26.X, PV.W, literal.x,
4355 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4356 ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
4357 ; EG-NEXT: LSHR T27.X, PV.W, literal.x,
4358 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4359 ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
4360 ; EG-NEXT: LSHR T28.X, PV.W, literal.x,
4361 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4362 ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
4363 ; EG-NEXT: LSHR T29.X, PV.W, literal.x,
4364 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4365 ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
4366 ; EG-NEXT: LSHR T30.X, PV.W, literal.x,
4367 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4368 ; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
4369 ; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
4370 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4372 ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64:
4374 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4375 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
4376 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
4377 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
4378 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:112
4379 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:96
4380 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:80
4381 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:64
4382 ; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:48
4383 ; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:32
4384 ; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:16
4385 ; GCN-HSA-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
4386 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4387 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
4388 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
4389 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:224
4390 ; GCN-HSA-NEXT: s_nop 0
4391 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
4392 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
4393 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:240
4394 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
4395 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
4396 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
4397 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:192
4398 ; GCN-HSA-NEXT: s_nop 0
4399 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
4400 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
4401 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:208
4402 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
4403 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12
4404 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13
4405 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:160
4406 ; GCN-HSA-NEXT: s_nop 0
4407 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14
4408 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15
4409 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:176
4410 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
4411 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16
4412 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17
4413 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:128
4414 ; GCN-HSA-NEXT: s_nop 0
4415 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18
4416 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19
4417 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:144
4418 ; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
4419 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v20
4420 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v21
4421 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
4422 ; GCN-HSA-NEXT: s_nop 0
4423 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v22
4424 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v23
4425 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
4426 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
4427 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v24
4428 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v25
4429 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
4430 ; GCN-HSA-NEXT: s_nop 0
4431 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v26
4432 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v27
4433 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
4434 ; GCN-HSA-NEXT: s_waitcnt vmcnt(13)
4435 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v28
4436 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v29
4437 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
4438 ; GCN-HSA-NEXT: s_nop 0
4439 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v30
4440 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v31
4441 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
4442 ; GCN-HSA-NEXT: s_waitcnt vmcnt(14)
4443 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v32
4444 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v33
4445 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
4446 ; GCN-HSA-NEXT: s_nop 0
4447 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v34
4448 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v35
4449 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
4450 ; GCN-HSA-NEXT: s_endpgm
4451 %ld = load <32 x i32>, ptr addrspace(1) %in
4452 %ext = zext <32 x i32> %ld to <32 x i64>
4453 store <32 x i64> %ext, ptr addrspace(1) %out
; Kernel under test: loads one <32 x i32> value from %in and stores it,
; unmodified, to %out (both pointers are in addrspace(1)).
; In every configuration below the expected code moves the 128 bytes as eight
; 16-byte loads (offsets 0..112) followed by eight 16-byte stores to the
; matching offsets.
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4457 define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; Expected output for the plain -mtriple=amdgcn invocation: buffer loads through
; the descriptor in s[8:11] and buffer stores through the one in s[4:7].
4458 ; SI-NOHSA-LABEL: global_load_v32i32:
4459 ; SI-NOHSA: ; %bb.0:
4460 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4461 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
4462 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
4463 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
4464 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
4465 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4466 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
4467 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
4468 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
4469 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
4470 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
4471 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
4472 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
4473 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
4474 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
4475 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
4476 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
4477 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
4478 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
4479 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
4480 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
4481 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
4482 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
4483 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
4484 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
4485 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
4486 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
4487 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
4488 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
4489 ; SI-NOHSA-NEXT: s_endpgm
; Expected output for the amdhsa / -mcpu=kaveri invocation: flat loads and
; stores; each 64-bit address is formed with an s_add_u32 / s_addc_u32 pair and
; copied into a VGPR pair.
4492 ; GCNX3-HSA-LABEL: global_load_v32i32:
4493 ; GCNX3-HSA: ; %bb.0:
4494 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4495 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
4496 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
4497 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4498 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48
4499 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
4500 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5
4501 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
4502 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
4503 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4
4504 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
4505 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
4506 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4507 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
4508 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
4509 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50
4510 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4511 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
4512 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
4513 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
4514 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4515 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5
4516 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4
4517 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70
4518 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6
4519 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4520 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7
4521 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60
4522 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
4523 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
4524 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
4525 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5
4526 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
4527 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4
4528 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
4529 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
4530 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
4531 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
4532 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
4533 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
4534 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
4535 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1
4536 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4537 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0
4538 ; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70
4539 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0
4540 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4541 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
4542 ; GCNX3-HSA-NEXT: s_nop 0
4543 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
4544 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
4545 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
4546 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4547 ; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50
4548 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0
4549 ; GCNX3-HSA-NEXT: s_add_u32 s8, s0, 32
4550 ; GCNX3-HSA-NEXT: s_addc_u32 s9, s1, 0
4551 ; GCNX3-HSA-NEXT: s_add_u32 s10, s0, 48
4552 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s1, 0
4553 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s10
4554 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s11
4555 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
4556 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
4557 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4558 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
4559 ; GCNX3-HSA-NEXT: s_nop 0
4560 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s8
4561 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s9
4562 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4563 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11]
4564 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6
4565 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4
4566 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
4567 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7
4568 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
4569 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5
4570 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
4571 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
4572 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4573 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
4574 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4575 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
4576 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4577 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23]
4578 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4579 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
4580 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4581 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[28:31]
4582 ; GCNX3-HSA-NEXT: s_endpgm
; Expected output for the -mcpu=tonga -mattr=-flat-for-global invocation:
; buffer loads and stores with immediate offsets.
4584 ; GCNX3-NOHSA-LABEL: global_load_v32i32:
4585 ; GCNX3-NOHSA: ; %bb.0:
4586 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4587 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
4588 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
4589 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
4590 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
4591 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4592 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
4593 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
4594 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4595 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4596 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4597 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4598 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
4599 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
4600 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
4601 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
4602 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
4603 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
4604 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
4605 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
4606 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
4607 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
4608 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
4609 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
4610 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4611 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
4612 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4613 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
4614 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4615 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
4616 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4617 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
4618 ; GCNX3-NOHSA-NEXT: s_endpgm
; Expected output for the -mtriple=r600 -mcpu=redwood invocation: one fetch
; clause of eight VTX_READ_128 reads and eight MEM_RAT_CACHELESS STORE_RAW
; writes; the ALU clauses compute each store address and shift it right by 2.
4620 ; EG-LABEL: global_load_v32i32:
4622 ; EG-NEXT: ALU 23, @28, KC0[CB0:0-32], KC1[]
4623 ; EG-NEXT: TEX 7 @12
4624 ; EG-NEXT: ALU 1, @52, KC0[], KC1[]
4625 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
4626 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
4627 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
4628 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
4629 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0
4630 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0
4631 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0
4632 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
4634 ; EG-NEXT: Fetch clause starting at 12:
4635 ; EG-NEXT: VTX_READ_128 T8.XYZW, T7.X, 96, #1
4636 ; EG-NEXT: VTX_READ_128 T9.XYZW, T7.X, 112, #1
4637 ; EG-NEXT: VTX_READ_128 T10.XYZW, T7.X, 64, #1
4638 ; EG-NEXT: VTX_READ_128 T11.XYZW, T7.X, 80, #1
4639 ; EG-NEXT: VTX_READ_128 T12.XYZW, T7.X, 32, #1
4640 ; EG-NEXT: VTX_READ_128 T13.XYZW, T7.X, 48, #1
4641 ; EG-NEXT: VTX_READ_128 T14.XYZW, T7.X, 0, #1
4642 ; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 16, #1
4643 ; EG-NEXT: ALU clause starting at 28:
4644 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4645 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4646 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
4647 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4648 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4649 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4650 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4651 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4652 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4653 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
4654 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
4655 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4656 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
4657 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4658 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4659 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
4660 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
4661 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4662 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
4663 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4664 ; EG-NEXT: MOV * T7.X, KC0[2].Z,
4665 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4666 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4667 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
4668 ; EG-NEXT: ALU clause starting at 52:
4669 ; EG-NEXT: LSHR * T15.X, T0.W, literal.x,
4670 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Expected output shared by the two amdhsa invocations (-mcpu=gfx900 and
; -mcpu=gfx908): global loads from s[2:3] and global stores to s[0:1], all
; using v32 (set to 0) as the VGPR offset plus an immediate offset.
4672 ; GCN-HSA-LABEL: global_load_v32i32:
4674 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4675 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0
4676 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
4677 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96
4678 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:112
4679 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:64
4680 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:80
4681 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:32
4682 ; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:48
4683 ; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3]
4684 ; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:16
4685 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4686 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
4687 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4688 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
4689 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4690 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
4691 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4692 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
4693 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4694 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
4695 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4696 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
4697 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4698 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
4699 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4700 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16
4701 ; GCN-HSA-NEXT: s_endpgm
; IR under test: a single whole-vector load from %in followed by a store of the
; same value to %out.
4702 %ld = load <32 x i32>, ptr addrspace(1) %in
4703 store <32 x i32> %ld, ptr addrspace(1) %out
; Attribute group #0, referenced by the kernel definitions in this file:
; nounwind, plus "amdgpu-flat-work-group-size" with both values of the pair set
; to 1024.
; NOTE(review): the pair is presumably "min,max", which would pin the flat
; work-group size to exactly 1024 - confirm against the AMDGPU usage guide.
4707 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }