1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
; Copies a single i32 from the global (addrspace 1) pointer %in to %out.
; The expected output below uses one dword-sized access per direction on every
; RUN configuration: buffer_load_dword (default-cpu amdgcn and tonga),
; flat_load_dword (kaveri/amdhsa), global_load_dword (gfx900/gfx908) and
; VTX_READ_32 (r600/redwood).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
9 define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
10 ; SI-NOHSA-LABEL: global_load_i32:
11 ; SI-NOHSA: ; %bb.0: ; %entry
12 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
13 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
14 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
15 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
16 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
17 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
19 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
20 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
21 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
22 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
23 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
25 ; SI-NOHSA-NEXT: s_endpgm
27 ; GCNX3-HSA-LABEL: global_load_i32:
28 ; GCNX3-HSA: ; %bb.0: ; %entry
29 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
30 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
31 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
33 ; GCNX3-HSA-NEXT: flat_load_dword v2, v[0:1]
34 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
35 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
36 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
37 ; GCNX3-HSA-NEXT: flat_store_dword v[0:1], v2
38 ; GCNX3-HSA-NEXT: s_endpgm
40 ; GCNX3-NOHSA-LABEL: global_load_i32:
41 ; GCNX3-NOHSA: ; %bb.0: ; %entry
42 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
43 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
44 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
45 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
46 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
47 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
48 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
49 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
50 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
51 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
52 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
53 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
54 ; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GCNX3-NOHSA-NEXT: s_endpgm
57 ; EG-LABEL: global_load_i32:
58 ; EG: ; %bb.0: ; %entry
59 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
61 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
62 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
65 ; EG-NEXT: Fetch clause starting at 6:
66 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
67 ; EG-NEXT: ALU clause starting at 8:
68 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
69 ; EG-NEXT: ALU clause starting at 9:
70 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
71 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
73 ; GCN-HSA-LABEL: global_load_i32:
74 ; GCN-HSA: ; %bb.0: ; %entry
75 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
76 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0
77 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
78 ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3]
79 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
80 ; GCN-HSA-NEXT: global_store_dword v0, v1, s[0:1]
81 ; GCN-HSA-NEXT: s_endpgm
83 %ld = load i32, ptr addrspace(1) %in
84 store i32 %ld, ptr addrspace(1) %out
; Copies a <2 x i32> vector from the global (addrspace 1) pointer %in to %out.
; Every amdgcn RUN configuration is expected to use one dwordx2 load and one
; dwordx2 store (buffer_, flat_ or global_ flavour depending on the target);
; the r600/redwood run is expected to use VTX_READ_64 and a single XY store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
88 define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
89 ; SI-NOHSA-LABEL: global_load_v2i32:
90 ; SI-NOHSA: ; %bb.0: ; %entry
91 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
92 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
93 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
94 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
95 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
96 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
97 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
98 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
99 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
100 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
101 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
102 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
103 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
104 ; SI-NOHSA-NEXT: s_endpgm
106 ; GCNX3-HSA-LABEL: global_load_v2i32:
107 ; GCNX3-HSA: ; %bb.0: ; %entry
108 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
109 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
110 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
111 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
112 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
113 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
114 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
115 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
116 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
117 ; GCNX3-HSA-NEXT: s_endpgm
119 ; GCNX3-NOHSA-LABEL: global_load_v2i32:
120 ; GCNX3-NOHSA: ; %bb.0: ; %entry
121 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
122 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
123 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
124 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
125 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
126 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
127 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
128 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
129 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
130 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
131 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
132 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
133 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
134 ; GCNX3-NOHSA-NEXT: s_endpgm
136 ; EG-LABEL: global_load_v2i32:
137 ; EG: ; %bb.0: ; %entry
138 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
140 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
141 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
144 ; EG-NEXT: Fetch clause starting at 6:
145 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
146 ; EG-NEXT: ALU clause starting at 8:
147 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
148 ; EG-NEXT: ALU clause starting at 9:
149 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
150 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
152 ; GCN-HSA-LABEL: global_load_v2i32:
153 ; GCN-HSA: ; %bb.0: ; %entry
154 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
155 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
156 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
157 ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
158 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
159 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
160 ; GCN-HSA-NEXT: s_endpgm
162 %ld = load <2 x i32>, ptr addrspace(1) %in
163 store <2 x i32> %ld, ptr addrspace(1) %out
; Copies a <3 x i32> vector from the global (addrspace 1) pointer %in to %out.
; The kaveri, tonga and gfx900/gfx908 runs are expected to use a single
; dwordx3 load and a single dwordx3 store. The default-cpu amdgcn run is
; expected to load a full dwordx4 and then store the three live elements as a
; dword at offset 8 plus a dwordx2 at offset 0. The r600/redwood run is
; expected to read 128 bits and emit two stores (one X-only, one XY).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
167 define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
168 ; SI-NOHSA-LABEL: global_load_v3i32:
169 ; SI-NOHSA: ; %bb.0: ; %entry
170 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
171 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
172 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
173 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
174 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
175 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
176 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
177 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
178 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
179 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
180 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
181 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
182 ; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
183 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
184 ; SI-NOHSA-NEXT: s_endpgm
186 ; GCNX3-HSA-LABEL: global_load_v3i32:
187 ; GCNX3-HSA: ; %bb.0: ; %entry
188 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
189 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
190 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
191 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
192 ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
193 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s0
194 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s1
195 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
196 ; GCNX3-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
197 ; GCNX3-HSA-NEXT: s_endpgm
199 ; GCNX3-NOHSA-LABEL: global_load_v3i32:
200 ; GCNX3-NOHSA: ; %bb.0: ; %entry
201 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
202 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
203 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
204 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
205 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
206 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
207 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
208 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
209 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
210 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
211 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
212 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
213 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
214 ; GCNX3-NOHSA-NEXT: s_endpgm
216 ; EG-LABEL: global_load_v3i32:
217 ; EG: ; %bb.0: ; %entry
218 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
220 ; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
221 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
222 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
224 ; EG-NEXT: Fetch clause starting at 6:
225 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
226 ; EG-NEXT: ALU clause starting at 8:
227 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
228 ; EG-NEXT: ALU clause starting at 9:
229 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
230 ; EG-NEXT: MOV * T2.X, T0.Z,
231 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
232 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
233 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
234 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
235 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
237 ; GCN-HSA-LABEL: global_load_v3i32:
238 ; GCN-HSA: ; %bb.0: ; %entry
239 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
240 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
241 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
242 ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
243 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
244 ; GCN-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
245 ; GCN-HSA-NEXT: s_endpgm
247 %ld = load <3 x i32>, ptr addrspace(1) %in
248 store <3 x i32> %ld, ptr addrspace(1) %out
; Copies a <4 x i32> vector from the global (addrspace 1) pointer %in to %out.
; Every amdgcn RUN configuration is expected to use one dwordx4 load and one
; dwordx4 store; the r600/redwood run is expected to use one VTX_READ_128 and
; a single XYZW store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
252 define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
253 ; SI-NOHSA-LABEL: global_load_v4i32:
254 ; SI-NOHSA: ; %bb.0: ; %entry
255 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
256 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
257 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
258 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
259 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
260 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
261 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
262 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
263 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
264 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
265 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
266 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
267 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
268 ; SI-NOHSA-NEXT: s_endpgm
270 ; GCNX3-HSA-LABEL: global_load_v4i32:
271 ; GCNX3-HSA: ; %bb.0: ; %entry
272 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
273 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
274 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
275 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
276 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
277 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
278 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
279 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
280 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
281 ; GCNX3-HSA-NEXT: s_endpgm
283 ; GCNX3-NOHSA-LABEL: global_load_v4i32:
284 ; GCNX3-NOHSA: ; %bb.0: ; %entry
285 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
286 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
287 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
288 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
289 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
290 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
291 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
292 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
293 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
294 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
295 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
296 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
297 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
298 ; GCNX3-NOHSA-NEXT: s_endpgm
300 ; EG-LABEL: global_load_v4i32:
301 ; EG: ; %bb.0: ; %entry
302 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
304 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
305 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
308 ; EG-NEXT: Fetch clause starting at 6:
309 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
310 ; EG-NEXT: ALU clause starting at 8:
311 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
312 ; EG-NEXT: ALU clause starting at 9:
313 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
314 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
316 ; GCN-HSA-LABEL: global_load_v4i32:
317 ; GCN-HSA: ; %bb.0: ; %entry
318 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
319 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
320 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
321 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
322 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
323 ; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
324 ; GCN-HSA-NEXT: s_endpgm
326 %ld = load <4 x i32>, ptr addrspace(1) %in
327 store <4 x i32> %ld, ptr addrspace(1) %out
; Copies an <8 x i32> vector from the global (addrspace 1) pointer %in to %out.
; The expected output splits the 32-byte access into two 16-byte halves at
; byte offsets 0 and 16: two dwordx4 loads and two dwordx4 stores on the
; amdgcn runs, two VTX_READ_128 and two XYZW stores on the r600/redwood run.
; On the kaveri run the +16 addresses are expected to be formed with
; s_add_u32/s_addc_u32, since the flat accesses there show no offset operand.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
331 define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
332 ; SI-NOHSA-LABEL: global_load_v8i32:
333 ; SI-NOHSA: ; %bb.0: ; %entry
334 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
335 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
336 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
337 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
338 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
339 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
340 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
341 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
342 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
343 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
344 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
345 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
346 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
347 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
348 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
349 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
350 ; SI-NOHSA-NEXT: s_endpgm
352 ; GCNX3-HSA-LABEL: global_load_v8i32:
353 ; GCNX3-HSA: ; %bb.0: ; %entry
354 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
355 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
356 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
357 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
358 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
359 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
360 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
361 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
362 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
363 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
364 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
365 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
366 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
367 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
368 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
369 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
370 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
371 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
372 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
373 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
374 ; GCNX3-HSA-NEXT: s_endpgm
376 ; GCNX3-NOHSA-LABEL: global_load_v8i32:
377 ; GCNX3-NOHSA: ; %bb.0: ; %entry
378 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
379 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
380 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
381 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
382 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
383 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
384 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
385 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
386 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
387 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
388 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
389 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
390 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
391 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
392 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
393 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
394 ; GCNX3-NOHSA-NEXT: s_endpgm
396 ; EG-LABEL: global_load_v8i32:
397 ; EG: ; %bb.0: ; %entry
398 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
400 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
401 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
402 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
404 ; EG-NEXT: Fetch clause starting at 6:
405 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
406 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
407 ; EG-NEXT: ALU clause starting at 10:
408 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
409 ; EG-NEXT: ALU clause starting at 11:
410 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
411 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
412 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
413 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
414 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
416 ; GCN-HSA-LABEL: global_load_v8i32:
417 ; GCN-HSA: ; %bb.0: ; %entry
418 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
419 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
420 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
421 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
422 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
423 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
424 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
425 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
426 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
427 ; GCN-HSA-NEXT: s_endpgm
429 %ld = load <8 x i32>, ptr addrspace(1) %in
430 store <8 x i32> %ld, ptr addrspace(1) %out
; Copies a <9 x i32> vector from the global (addrspace 1) pointer %in to %out.
; The expected output splits the 36-byte access into three pieces at byte
; offsets 0, 16 and 32: two dwordx4 accesses plus one single-dword access on
; the amdgcn runs, and two VTX_READ_128 plus one VTX_READ_32 on the
; r600/redwood run (stored as two XYZW stores and one X-only store).
; On the kaveri run the +16 and +32 addresses are expected to be formed with
; s_add_u32/s_addc_u32, since the flat accesses there show no offset operand.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
434 define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
435 ; SI-NOHSA-LABEL: global_load_v9i32:
436 ; SI-NOHSA: ; %bb.0: ; %entry
437 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
438 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
440 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
441 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
442 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
444 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
445 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
446 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
447 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
448 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
449 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
450 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
451 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
452 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
453 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
454 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
455 ; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
456 ; SI-NOHSA-NEXT: s_endpgm
458 ; GCNX3-HSA-LABEL: global_load_v9i32:
459 ; GCNX3-HSA: ; %bb.0: ; %entry
460 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
461 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
462 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
463 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
464 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
465 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
466 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
467 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
468 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
469 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
470 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
471 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
472 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
473 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
474 ; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9]
475 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
476 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
477 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
478 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
479 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
480 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
481 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3
482 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
483 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2
484 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
485 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
486 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
487 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
488 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
489 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
490 ; GCNX3-HSA-NEXT: flat_store_dword v[12:13], v14
491 ; GCNX3-HSA-NEXT: s_endpgm
493 ; GCNX3-NOHSA-LABEL: global_load_v9i32:
494 ; GCNX3-NOHSA: ; %bb.0: ; %entry
495 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
496 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
497 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
498 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
499 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
500 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
501 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
502 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
503 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
504 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
505 ; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
506 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
507 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
508 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
509 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
510 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
511 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
512 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
513 ; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
514 ; GCNX3-NOHSA-NEXT: s_endpgm
516 ; EG-LABEL: global_load_v9i32:
517 ; EG: ; %bb.0: ; %entry
518 ; EG-NEXT: ALU 8, @14, KC0[CB0:0-32], KC1[]
520 ; EG-NEXT: ALU 1, @23, KC0[CB0:0-32], KC1[]
521 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
522 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
523 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
526 ; EG-NEXT: Fetch clause starting at 8:
527 ; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 0, #1
528 ; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 16, #1
529 ; EG-NEXT: VTX_READ_32 T3.X, T3.X, 32, #1
530 ; EG-NEXT: ALU clause starting at 14:
531 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
532 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
533 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
534 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
535 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
536 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
537 ; EG-NEXT: MOV * T2.X, KC0[2].Z,
538 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
539 ; EG-NEXT: MOV * T3.X, PS,
540 ; EG-NEXT: ALU clause starting at 23:
541 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
542 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
544 ; GCN-HSA-LABEL: global_load_v9i32:
545 ; GCN-HSA: ; %bb.0: ; %entry
546 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
547 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
548 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
549 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
550 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
551 ; GCN-HSA-NEXT: global_load_dword v9, v8, s[2:3] offset:32
552 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
553 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
554 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
555 ; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
556 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
557 ; GCN-HSA-NEXT: global_store_dword v8, v9, s[0:1] offset:32
558 ; GCN-HSA-NEXT: s_endpgm
560 %ld = load <9 x i32>, ptr addrspace(1) %in
561 store <9 x i32> %ld, ptr addrspace(1) %out
; Copies a <10 x i32> vector from the global (addrspace 1) pointer %in to %out.
; The expected output splits the 40-byte access into three pieces at byte
; offsets 0, 16 and 32: two dwordx4 accesses plus one dwordx2 access on the
; amdgcn runs. The r600/redwood run is expected to issue three VTX_READ_128
; fetches and to store only the XY channels of the third one.
; On the kaveri run the +16 and +32 addresses are expected to be formed with
; s_add_u32/s_addc_u32, since the flat accesses there show no offset operand.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
565 define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
566 ; SI-NOHSA-LABEL: global_load_v10i32:
567 ; SI-NOHSA: ; %bb.0: ; %entry
568 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
569 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
570 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
571 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
572 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
573 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
574 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
575 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
576 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
577 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
578 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
579 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
580 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
581 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
582 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
583 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
584 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
585 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
586 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
587 ; SI-NOHSA-NEXT: s_endpgm
589 ; GCNX3-HSA-LABEL: global_load_v10i32:
590 ; GCNX3-HSA: ; %bb.0: ; %entry
591 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
592 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
593 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
594 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
595 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
596 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
597 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
598 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
599 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
600 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
601 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
602 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
603 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
604 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
605 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
606 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
607 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
608 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
609 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
610 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
611 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
612 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
613 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1
614 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
615 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0
616 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
617 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
618 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
619 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
620 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
621 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
622 ; GCNX3-HSA-NEXT: s_endpgm
624 ; GCNX3-NOHSA-LABEL: global_load_v10i32:
625 ; GCNX3-NOHSA: ; %bb.0: ; %entry
626 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
627 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
628 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
629 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
630 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
631 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
632 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
633 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
634 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
635 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
636 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
637 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
638 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
639 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
640 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
641 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
642 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
643 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
644 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
645 ; GCNX3-NOHSA-NEXT: s_endpgm
647 ; EG-LABEL: global_load_v10i32:
648 ; EG: ; %bb.0: ; %entry
649 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
651 ; EG-NEXT: ALU 7, @15, KC0[CB0:0-32], KC1[]
652 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
653 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
654 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
657 ; EG-NEXT: Fetch clause starting at 8:
658 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
659 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
660 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
661 ; EG-NEXT: ALU clause starting at 14:
662 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
663 ; EG-NEXT: ALU clause starting at 15:
664 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
665 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
666 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
667 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
668 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
669 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
670 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
671 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
673 ; GCN-HSA-LABEL: global_load_v10i32:
674 ; GCN-HSA: ; %bb.0: ; %entry
675 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
676 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0
677 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
678 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3]
679 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v10, s[2:3] offset:16
680 ; GCN-HSA-NEXT: global_load_dwordx2 v[8:9], v10, s[2:3] offset:32
681 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
682 ; GCN-HSA-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
683 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
684 ; GCN-HSA-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
685 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
686 ; GCN-HSA-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
687 ; GCN-HSA-NEXT: s_endpgm
689 %ld = load <10 x i32>, ptr addrspace(1) %in
690 store <10 x i32> %ld, ptr addrspace(1) %out
694 define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
695 ; SI-NOHSA-LABEL: global_load_v11i32:
696 ; SI-NOHSA: ; %bb.0: ; %entry
697 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
698 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
699 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
700 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
701 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
702 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
703 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
704 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
705 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
706 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
707 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
708 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
709 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
710 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
711 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
712 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
713 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
714 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
715 ; SI-NOHSA-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:40
716 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
717 ; SI-NOHSA-NEXT: s_endpgm
719 ; GCNX3-HSA-LABEL: global_load_v11i32:
720 ; GCNX3-HSA: ; %bb.0: ; %entry
721 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
722 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
723 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
724 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
725 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
726 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
727 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
728 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
729 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
730 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
731 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
732 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
733 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
734 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
735 ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
736 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
737 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
738 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
739 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
740 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
741 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
742 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
743 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1
744 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
745 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0
746 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
747 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
748 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
749 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
750 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
751 ; GCNX3-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
752 ; GCNX3-HSA-NEXT: s_endpgm
754 ; GCNX3-NOHSA-LABEL: global_load_v11i32:
755 ; GCNX3-NOHSA: ; %bb.0: ; %entry
756 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
757 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
758 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
759 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
760 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
761 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
762 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
763 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
764 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
765 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
766 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
767 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
768 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
769 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
770 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
771 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
772 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
773 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
774 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
775 ; GCNX3-NOHSA-NEXT: s_endpgm
777 ; EG-LABEL: global_load_v11i32:
778 ; EG: ; %bb.0: ; %entry
779 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
781 ; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
782 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
783 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
784 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
785 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
787 ; EG-NEXT: Fetch clause starting at 8:
788 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
789 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
790 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
791 ; EG-NEXT: ALU clause starting at 14:
792 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
793 ; EG-NEXT: ALU clause starting at 15:
794 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
795 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
796 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
797 ; EG-NEXT: MOV * T4.X, T0.Z,
798 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
799 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
800 ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
801 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
802 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
803 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
804 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
805 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
806 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
808 ; GCN-HSA-LABEL: global_load_v11i32:
809 ; GCN-HSA: ; %bb.0: ; %entry
810 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
811 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
812 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
813 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
814 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v11, s[2:3] offset:16
815 ; GCN-HSA-NEXT: global_load_dwordx3 v[8:10], v11, s[2:3] offset:32
816 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
817 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1]
818 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
819 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] offset:16
820 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
821 ; GCN-HSA-NEXT: global_store_dwordx3 v11, v[8:10], s[0:1] offset:32
822 ; GCN-HSA-NEXT: s_endpgm
824 %ld = load <11 x i32>, ptr addrspace(1) %in
825 store <11 x i32> %ld, ptr addrspace(1) %out
; Test: load a <12 x i32> (48 bytes) from the global (addrspace 1) pointer %in
; and store it unchanged to %out.
; The amdgcn runs expect the access to be split into three dwordx4 loads and
; three dwordx4 stores covering byte offsets 0, 16 and 32; the r600 run expects
; three VTX_READ_128 fetches followed by three MEM_RAT_CACHELESS stores.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
830 define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
831 ; SI-NOHSA-LABEL: global_load_v12i32:
832 ; SI-NOHSA: ; %bb.0: ; %entry
833 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
834 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
835 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
836 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
837 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
838 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
839 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
840 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
841 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
842 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
843 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
844 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
845 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
846 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
847 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
848 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
849 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
850 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
851 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
852 ; SI-NOHSA-NEXT: s_endpgm
854 ; GCNX3-HSA-LABEL: global_load_v12i32:
855 ; GCNX3-HSA: ; %bb.0: ; %entry
856 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
857 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
858 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
859 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
860 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
861 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
862 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
863 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
864 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
865 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
866 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
867 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
868 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
869 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
870 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
871 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
872 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
873 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
874 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
875 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
876 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
877 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
878 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
879 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
880 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
881 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
882 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
883 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
884 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
885 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
886 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
887 ; GCNX3-HSA-NEXT: s_endpgm
889 ; GCNX3-NOHSA-LABEL: global_load_v12i32:
890 ; GCNX3-NOHSA: ; %bb.0: ; %entry
891 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
892 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
893 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
894 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
895 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
896 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
897 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
898 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
899 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
900 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
901 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
902 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
903 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
904 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
905 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
906 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
907 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
908 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
909 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
910 ; GCNX3-NOHSA-NEXT: s_endpgm
912 ; EG-LABEL: global_load_v12i32:
913 ; EG: ; %bb.0: ; %entry
914 ; EG-NEXT: ALU 7, @14, KC0[CB0:0-32], KC1[]
916 ; EG-NEXT: ALU 1, @22, KC0[CB0:0-32], KC1[]
917 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
918 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
919 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
922 ; EG-NEXT: Fetch clause starting at 8:
923 ; EG-NEXT: VTX_READ_128 T3.XYZW, T2.X, 0, #1
924 ; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 16, #1
925 ; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 32, #1
926 ; EG-NEXT: ALU clause starting at 14:
927 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
928 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
929 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
930 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
931 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
932 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
933 ; EG-NEXT: MOV * T2.X, KC0[2].Z,
934 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
935 ; EG-NEXT: ALU clause starting at 22:
936 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
937 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
939 ; GCN-HSA-LABEL: global_load_v12i32:
940 ; GCN-HSA: ; %bb.0: ; %entry
941 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
942 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
943 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
944 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
945 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
946 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:32
947 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
948 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
949 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
950 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
951 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
952 ; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:32
953 ; GCN-HSA-NEXT: s_endpgm
955 %ld = load <12 x i32>, ptr addrspace(1) %in
956 store <12 x i32> %ld, ptr addrspace(1) %out
; Test: load a <16 x i32> (64 bytes) from the global (addrspace 1) pointer %in
; and store it unchanged to %out.
; The amdgcn runs expect four dwordx4 loads and four dwordx4 stores covering
; byte offsets 0, 16, 32 and 48 (issued in the order 32, 48, 0, 16 for the
; buffer/global forms); the r600 run expects four VTX_READ_128 fetches.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
960 define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
961 ; SI-NOHSA-LABEL: global_load_v16i32:
962 ; SI-NOHSA: ; %bb.0: ; %entry
963 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
964 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
965 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
966 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
967 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
968 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
969 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
970 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
971 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
972 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
973 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
974 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
975 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
976 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
977 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
978 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
979 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
980 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
981 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
982 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
983 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
984 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
985 ; SI-NOHSA-NEXT: s_endpgm
987 ; GCNX3-HSA-LABEL: global_load_v16i32:
988 ; GCNX3-HSA: ; %bb.0: ; %entry
989 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
990 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
991 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
992 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
993 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
994 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
995 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
996 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
997 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
998 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
999 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32
1000 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
1001 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
1002 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
1003 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
1004 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
1005 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1006 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1007 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
1008 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
1009 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
1010 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1011 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
1012 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
1013 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
1014 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
1015 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1016 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
1017 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
1018 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
1019 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
1020 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
1021 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
1022 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
1023 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1024 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1025 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1026 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
1027 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1028 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
1029 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
1030 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
1031 ; GCNX3-HSA-NEXT: s_endpgm
1033 ; GCNX3-NOHSA-LABEL: global_load_v16i32:
1034 ; GCNX3-NOHSA: ; %bb.0: ; %entry
1035 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1036 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1037 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1038 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1039 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1040 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1042 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1043 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
1044 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
1045 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
1046 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
1047 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1048 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1049 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1050 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
1051 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1052 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
1053 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1054 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
1055 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
1056 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
1057 ; GCNX3-NOHSA-NEXT: s_endpgm
1059 ; EG-LABEL: global_load_v16i32:
1060 ; EG: ; %bb.0: ; %entry
1061 ; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[]
1063 ; EG-NEXT: ALU 1, @28, KC0[], KC1[]
1064 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
1065 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
1066 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
1067 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
1069 ; EG-NEXT: Fetch clause starting at 8:
1070 ; EG-NEXT: VTX_READ_128 T4.XYZW, T3.X, 32, #1
1071 ; EG-NEXT: VTX_READ_128 T5.XYZW, T3.X, 48, #1
1072 ; EG-NEXT: VTX_READ_128 T6.XYZW, T3.X, 0, #1
1073 ; EG-NEXT: VTX_READ_128 T3.XYZW, T3.X, 16, #1
1074 ; EG-NEXT: ALU clause starting at 16:
1075 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1076 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1077 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
1078 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1079 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1080 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1081 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
1082 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1083 ; EG-NEXT: MOV * T3.X, KC0[2].Z,
1084 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1085 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1086 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1087 ; EG-NEXT: ALU clause starting at 28:
1088 ; EG-NEXT: LSHR * T7.X, T0.W, literal.x,
1089 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1091 ; GCN-HSA-LABEL: global_load_v16i32:
1092 ; GCN-HSA: ; %bb.0: ; %entry
1093 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1094 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0
1095 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32
1097 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:48
1098 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3]
1099 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16
1100 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1101 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
1102 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1103 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:48
1104 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1105 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1]
1106 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
1107 ; GCN-HSA-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16
1108 ; GCN-HSA-NEXT: s_endpgm
1110 %ld = load <16 x i32>, ptr addrspace(1) %in
1111 store <16 x i32> %ld, ptr addrspace(1) %out
; Test: load an i32 from the global (addrspace 1) pointer %in, zero-extend it
; to i64 and store the result to %out.
; The amdgcn runs expect a single dword load whose result becomes the low half,
; a constant 0 moved into the high half, and one dwordx2 store. In the
; global_load/global_store form the zeroed v1 is used both as the address
; offset register and as the high dword of the stored pair.
; The r600 run expects VTX_READ_32 plus "MOV T0.Y, 0.0" for the high half.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1115 define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1116 ; SI-NOHSA-LABEL: global_zextload_i32_to_i64:
1117 ; SI-NOHSA: ; %bb.0:
1118 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1119 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1120 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1121 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1122 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1123 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1124 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1125 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1126 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1127 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1128 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1129 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1130 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1131 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1132 ; SI-NOHSA-NEXT: s_endpgm
1134 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
1135 ; GCNX3-HSA: ; %bb.0:
1136 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1137 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1138 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1139 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1140 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1141 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1142 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1143 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1144 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1145 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1146 ; GCNX3-HSA-NEXT: s_endpgm
1148 ; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
1149 ; GCNX3-NOHSA: ; %bb.0:
1150 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1151 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1152 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1153 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1154 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1155 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1156 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1157 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1158 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1159 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1160 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1161 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1162 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1163 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1164 ; GCNX3-NOHSA-NEXT: s_endpgm
1166 ; EG-LABEL: global_zextload_i32_to_i64:
1168 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1170 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1171 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1174 ; EG-NEXT: Fetch clause starting at 6:
1175 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1176 ; EG-NEXT: ALU clause starting at 8:
1177 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1178 ; EG-NEXT: ALU clause starting at 9:
1179 ; EG-NEXT: MOV * T0.Y, 0.0,
1180 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1181 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1183 ; GCN-HSA-LABEL: global_zextload_i32_to_i64:
1185 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1186 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1187 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1188 ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3]
1189 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1190 ; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1191 ; GCN-HSA-NEXT: s_endpgm
1192 %ld = load i32, ptr addrspace(1) %in
1193 %ext = zext i32 %ld to i64
1194 store i64 %ext, ptr addrspace(1) %out
; Test: load an i32 from the global (addrspace 1) pointer %in, sign-extend it
; to i64 and store the result to %out.
; The amdgcn runs expect a single dword load, a wait for that load, then
; v_ashrrev_i32 by 31 to replicate the sign bit into the high half, and one
; dwordx2 store. The r600 run expects VTX_READ_32 plus an ASHR by 31 that
; produces T0.Y.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1198 define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1199 ; SI-NOHSA-LABEL: global_sextload_i32_to_i64:
1200 ; SI-NOHSA: ; %bb.0:
1201 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1202 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1203 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1204 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1205 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1206 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1207 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1208 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1209 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1210 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1211 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1212 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1213 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1214 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1215 ; SI-NOHSA-NEXT: s_endpgm
1217 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
1218 ; GCNX3-HSA: ; %bb.0:
1219 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1220 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1221 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1222 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1223 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1224 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1225 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1226 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1227 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1228 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1229 ; GCNX3-HSA-NEXT: s_endpgm
1231 ; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
1232 ; GCNX3-NOHSA: ; %bb.0:
1233 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1234 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1235 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1236 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1237 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1238 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1240 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1241 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1242 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1243 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1244 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1245 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1246 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1247 ; GCNX3-NOHSA-NEXT: s_endpgm
1249 ; EG-LABEL: global_sextload_i32_to_i64:
1251 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1253 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1254 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1257 ; EG-NEXT: Fetch clause starting at 6:
1258 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1259 ; EG-NEXT: ALU clause starting at 8:
1260 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1261 ; EG-NEXT: ALU clause starting at 9:
1262 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1263 ; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
1264 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1266 ; GCN-HSA-LABEL: global_sextload_i32_to_i64:
1268 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1269 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
1270 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1271 ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3]
1272 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1273 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1274 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1275 ; GCN-HSA-NEXT: s_endpgm
1276 %ld = load i32, ptr addrspace(1) %in
1277 %ext = sext i32 %ld to i64
1278 store i64 %ext, ptr addrspace(1) %out
; Test: load a <1 x i32> from the global (addrspace 1) pointer %in, zero-extend
; it to <1 x i64> and store the result to %out.
; The single-element vector is expected to be handled with one dword load, a
; constant 0 for the high half and one dwordx2 store on the amdgcn runs, and
; with VTX_READ_32 plus "MOV T0.Y, 0.0" on the r600 run.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1282 define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1283 ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1284 ; SI-NOHSA: ; %bb.0:
1285 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1286 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1287 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1288 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1289 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1290 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1291 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1292 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1293 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1294 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1295 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1296 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1297 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1298 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1299 ; SI-NOHSA-NEXT: s_endpgm
1301 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1302 ; GCNX3-HSA: ; %bb.0:
1303 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1304 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1305 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1306 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1307 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1308 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1309 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1310 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1311 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1312 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1313 ; GCNX3-HSA-NEXT: s_endpgm
1315 ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1316 ; GCNX3-NOHSA: ; %bb.0:
1317 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1318 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1319 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1320 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1321 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1322 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1323 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1324 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1325 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1326 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1327 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1328 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1329 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1330 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1331 ; GCNX3-NOHSA-NEXT: s_endpgm
1333 ; EG-LABEL: global_zextload_v1i32_to_v1i64:
1335 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1337 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1338 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1341 ; EG-NEXT: Fetch clause starting at 6:
1342 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1343 ; EG-NEXT: ALU clause starting at 8:
1344 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1345 ; EG-NEXT: ALU clause starting at 9:
1346 ; EG-NEXT: MOV * T0.Y, 0.0,
1347 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1348 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1350 ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1352 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1353 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1354 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3]
1356 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1357 ; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1358 ; GCN-HSA-NEXT: s_endpgm
1359 %ld = load <1 x i32>, ptr addrspace(1) %in
1360 %ext = zext <1 x i32> %ld to <1 x i64>
1361 store <1 x i64> %ext, ptr addrspace(1) %out
; Test: load a <1 x i32> from the global (addrspace 1) pointer %in, sign-extend
; it to <1 x i64> and store the result to %out.
; The single-element vector is expected to be handled with one dword load,
; v_ashrrev_i32 by 31 for the high half (after waiting for the load) and one
; dwordx2 store on the amdgcn runs, and with VTX_READ_32 plus an ASHR by 31 on
; the r600 run.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1365 define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1366 ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1367 ; SI-NOHSA: ; %bb.0:
1368 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1369 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1370 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1371 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1372 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1373 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1374 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1375 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1376 ; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1377 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1378 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1379 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1380 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1381 ; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1382 ; SI-NOHSA-NEXT: s_endpgm
1384 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1385 ; GCNX3-HSA: ; %bb.0:
1386 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1387 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1388 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1389 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1390 ; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1]
1391 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
1392 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
1393 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1394 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1395 ; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1396 ; GCNX3-HSA-NEXT: s_endpgm
1398 ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1399 ; GCNX3-NOHSA: ; %bb.0:
1400 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1401 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1402 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1403 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1404 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1405 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1406 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1407 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1408 ; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
1409 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1410 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1411 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1412 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1413 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1414 ; GCNX3-NOHSA-NEXT: s_endpgm
1416 ; EG-LABEL: global_sextload_v1i32_to_v1i64:
1418 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1420 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1421 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1424 ; EG-NEXT: Fetch clause starting at 6:
1425 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1426 ; EG-NEXT: ALU clause starting at 8:
1427 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1428 ; EG-NEXT: ALU clause starting at 9:
1429 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1430 ; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
1431 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1433 ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1435 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1436 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
1437 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3]
1439 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1440 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1441 ; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1442 ; GCN-HSA-NEXT: s_endpgm
1443 %ld = load <1 x i32>, ptr addrspace(1) %in
1444 %ext = sext <1 x i32> %ld to <1 x i64>
1445 store <1 x i64> %ext, ptr addrspace(1) %out
; Test: load a <2 x i32> from the global (addrspace 1) pointer %in, zero-extend
; it to <2 x i64> and store the result to %out.
; The amdgcn runs expect one dwordx2 load, the two loaded dwords moved into
; lanes 0 and 2 of a four-register tuple with zeros in lanes 1 and 3, and one
; dwordx4 store. The r600 run expects VTX_READ_64 followed by MOVs that place
; T0.X/T0.Y in T1.X/T1.Z and 0.0 in T1.Y/T1.W.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1449 define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1450 ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1451 ; SI-NOHSA: ; %bb.0:
1452 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1453 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1454 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1455 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1456 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1457 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1458 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1459 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1460 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1461 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1462 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1
1463 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1464 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1465 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1466 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
1467 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
1468 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1469 ; SI-NOHSA-NEXT: s_endpgm
1471 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1472 ; GCNX3-HSA: ; %bb.0:
1473 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1474 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1475 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1476 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1477 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1478 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
1479 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
1480 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
1481 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1482 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v2
1483 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v3
1484 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1
1485 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1486 ; GCNX3-HSA-NEXT: s_endpgm
1488 ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1489 ; GCNX3-NOHSA: ; %bb.0:
1490 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1491 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1492 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1493 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1494 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1495 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1496 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1497 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1498 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1499 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
1500 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1501 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1502 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1503 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2
1504 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3
1505 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1
1506 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1507 ; GCNX3-NOHSA-NEXT: s_endpgm
1509 ; EG-LABEL: global_zextload_v2i32_to_v2i64:
1511 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1513 ; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
1514 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1517 ; EG-NEXT: Fetch clause starting at 6:
1518 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1519 ; EG-NEXT: ALU clause starting at 8:
1520 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1521 ; EG-NEXT: ALU clause starting at 9:
1522 ; EG-NEXT: MOV T1.X, T0.X,
1523 ; EG-NEXT: MOV T1.Y, 0.0,
1524 ; EG-NEXT: MOV T1.Z, T0.Y,
1525 ; EG-NEXT: MOV T1.W, 0.0,
1526 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1527 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1529 ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1531 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1532 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1533 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1534 ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
1535 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1536 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v2
1537 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v3
1538 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
1539 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
1540 ; GCN-HSA-NEXT: s_endpgm
1541 %ld = load <2 x i32>, ptr addrspace(1) %in
1542 %ext = zext <2 x i32> %ld to <2 x i64>
1543 store <2 x i64> %ext, ptr addrspace(1) %out
; Test kernel: load a <2 x i32> vector from the addrspace(1) pointer %in,
; sign-extend every lane to i64, and store the <2 x i64> result to %out.
; In the expected amdgcn output below, the upper dword of each lane is formed
; with v_ashrrev_i32 by 31 from the loaded dword; the r600 output uses ASHR
; with the literal 31 in the same role.
; The assertion lines in this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script rather than editing by hand.
1547 define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1548 ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1549 ; SI-NOHSA: ; %bb.0:
1550 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1551 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1552 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1553 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1554 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1555 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1556 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1557 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1558 ; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1559 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1560 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1561 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1562 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1563 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v1
1564 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1565 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1566 ; SI-NOHSA-NEXT: s_endpgm
1568 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1569 ; GCNX3-HSA: ; %bb.0:
1570 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1571 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1572 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1573 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1574 ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1575 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
1576 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
1577 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1578 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1579 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v1
1580 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1581 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1582 ; GCNX3-HSA-NEXT: s_endpgm
1584 ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1585 ; GCNX3-NOHSA: ; %bb.0:
1586 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1587 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1588 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1589 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1590 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1591 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1592 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1593 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1594 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1595 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1596 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1597 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1598 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1599 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1
1600 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1601 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1602 ; GCNX3-NOHSA-NEXT: s_endpgm
1604 ; EG-LABEL: global_sextload_v2i32_to_v2i64:
1606 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1608 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
1609 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1612 ; EG-NEXT: Fetch clause starting at 6:
1613 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1614 ; EG-NEXT: ALU clause starting at 8:
1615 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1616 ; EG-NEXT: ALU clause starting at 9:
1617 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.x,
1618 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1619 ; EG-NEXT: ASHR * T1.Y, T0.X, literal.x,
1620 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1621 ; EG-NEXT: MOV T1.X, T0.X,
1622 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1623 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1624 ; EG-NEXT: MOV * T1.Z, T0.Y,
1626 ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1628 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1629 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
1630 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1631 ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1632 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1633 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1634 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v1
1635 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1636 ; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1637 ; GCN-HSA-NEXT: s_endpgm
1638 %ld = load <2 x i32>, ptr addrspace(1) %in
1639 %ext = sext <2 x i32> %ld to <2 x i64>
1640 store <2 x i64> %ext, ptr addrspace(1) %out
; Test kernel: load a <4 x i32> vector from the addrspace(1) pointer %in,
; zero-extend every lane to i64, and store the <4 x i64> result to %out.
; In the expected amdgcn output below, one dwordx4 load feeds two dwordx4
; stores (destination offsets 16 and 0), and the upper dword of each lane is
; taken from a register that was set to 0. The r600 output writes 0.0 into the
; Y and W channels of each stored register.
; The assertion lines in this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script rather than editing by hand.
1644 define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1645 ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1646 ; SI-NOHSA: ; %bb.0:
1647 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1648 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1649 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1650 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1651 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1652 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1653 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1654 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1655 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1656 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0
1657 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5
1658 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1659 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1660 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1661 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
1662 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
1663 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1664 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1665 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0
1666 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1
1667 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1668 ; SI-NOHSA-NEXT: s_endpgm
1670 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1671 ; GCNX3-HSA: ; %bb.0:
1672 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1673 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
1674 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
1675 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1676 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1677 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1678 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1679 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1680 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1681 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
1682 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
1683 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1684 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
1685 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
1686 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
1687 ; GCNX3-HSA-NEXT: s_nop 0
1688 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v0
1689 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v1
1690 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
1691 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
1692 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
1693 ; GCNX3-HSA-NEXT: s_endpgm
1695 ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1696 ; GCNX3-NOHSA: ; %bb.0:
1697 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1698 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1699 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1700 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1701 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1702 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1703 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1704 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1705 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1706 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0
1707 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5
1708 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1709 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1710 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1711 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
1712 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
1713 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1714 ; GCNX3-NOHSA-NEXT: s_nop 0
1715 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
1716 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
1717 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1718 ; GCNX3-NOHSA-NEXT: s_endpgm
1720 ; EG-LABEL: global_zextload_v4i32_to_v4i64:
1722 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1724 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1725 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
1726 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1728 ; EG-NEXT: Fetch clause starting at 6:
1729 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1730 ; EG-NEXT: ALU clause starting at 8:
1731 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1732 ; EG-NEXT: ALU clause starting at 9:
1733 ; EG-NEXT: MOV T1.X, T0.Z,
1734 ; EG-NEXT: MOV T1.Y, 0.0,
1735 ; EG-NEXT: MOV * T2.X, T0.X,
1736 ; EG-NEXT: MOV T2.Y, 0.0,
1737 ; EG-NEXT: MOV T1.Z, T0.W,
1738 ; EG-NEXT: MOV T1.W, 0.0,
1739 ; EG-NEXT: MOV * T2.Z, T0.Y,
1740 ; EG-NEXT: MOV * T2.W, 0.0,
1741 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1742 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
1743 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1744 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
1745 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1747 ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1749 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1750 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
1751 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
1752 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1753 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3]
1754 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1755 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
1756 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
1757 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
1758 ; GCN-HSA-NEXT: s_nop 0
1759 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
1760 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
1761 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
1762 ; GCN-HSA-NEXT: s_endpgm
1763 %ld = load <4 x i32>, ptr addrspace(1) %in
1764 %ext = zext <4 x i32> %ld to <4 x i64>
1765 store <4 x i64> %ext, ptr addrspace(1) %out
; Test kernel: load a <4 x i32> vector from the addrspace(1) pointer %in,
; sign-extend every lane to i64, and store the <4 x i64> result to %out.
; In the expected amdgcn output below, one dwordx4 load feeds two dwordx4
; stores (destination offsets 16 and 0), and the upper dword of each lane is
; formed with v_ashrrev_i32 by 31 from the loaded dword. The r600 output uses
; ASHR with the literal 31 in the same role.
; The assertion lines in this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script rather than editing by hand.
1769 define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1770 ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1771 ; SI-NOHSA: ; %bb.0:
1772 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1773 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1774 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1775 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1776 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1777 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1778 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1779 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1780 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1781 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1782 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1783 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
1784 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1785 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1786 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1787 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1788 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v2
1789 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v3
1790 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v0
1791 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, v1
1792 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1793 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1794 ; SI-NOHSA-NEXT: s_endpgm
1796 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1797 ; GCNX3-HSA: ; %bb.0:
1798 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1799 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1800 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1801 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1802 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1803 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1804 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1805 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
1806 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
1807 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
1808 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
1809 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
1810 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1811 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1812 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v2
1813 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v3
1814 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1815 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1816 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v0
1817 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, v1
1818 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
1819 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[3:6]
1820 ; GCNX3-HSA-NEXT: s_endpgm
1822 ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1823 ; GCNX3-NOHSA: ; %bb.0:
1824 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1825 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1826 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1827 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1828 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1829 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1830 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1831 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1832 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1833 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1834 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1835 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
1836 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1837 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1838 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v2
1839 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v3
1840 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1841 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1842 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0
1843 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1
1844 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1845 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1846 ; GCNX3-NOHSA-NEXT: s_endpgm
1848 ; EG-LABEL: global_sextload_v4i32_to_v4i64:
1850 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1852 ; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
1853 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
1854 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
1856 ; EG-NEXT: Fetch clause starting at 6:
1857 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1858 ; EG-NEXT: ALU clause starting at 8:
1859 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1860 ; EG-NEXT: ALU clause starting at 9:
1861 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.x,
1862 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1863 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
1864 ; EG-NEXT: ASHR T1.Y, T0.X, literal.y,
1865 ; EG-NEXT: ASHR T3.W, T0.W, literal.y,
1866 ; EG-NEXT: MOV * T1.X, T0.X,
1867 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
1868 ; EG-NEXT: ASHR * T3.Y, T0.Z, literal.x,
1869 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1870 ; EG-NEXT: MOV T3.X, T0.Z,
1871 ; EG-NEXT: MOV T1.Z, T0.Y,
1872 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1873 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1874 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
1875 ; EG-NEXT: MOV * T3.Z, T0.W,
1876 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1878 ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1880 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1881 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
1882 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
1883 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
1884 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
1885 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1886 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
1887 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v2
1888 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v3
1889 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1890 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1891 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v0
1892 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
1893 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[7:10], s[0:1] offset:16
1894 ; GCN-HSA-NEXT: global_store_dwordx4 v11, v[3:6], s[0:1]
1895 ; GCN-HSA-NEXT: s_endpgm
1896 %ld = load <4 x i32>, ptr addrspace(1) %in
1897 %ext = sext <4 x i32> %ld to <4 x i64>
1898 store <4 x i64> %ext, ptr addrspace(1) %out
; Test kernel: load an <8 x i32> vector from the addrspace(1) pointer %in,
; zero-extend every lane to i64, and store the <8 x i64> result to %out.
; In the expected amdgcn output below, two dwordx4 loads (source offsets 0 and
; 16) feed four dwordx4 stores (destination offsets 0, 16, 32 and 48), and the
; upper dword of each lane is taken from a register that was set to 0. The
; r600 output writes 0.0 into the Y and W channels of each stored register.
; The assertion lines in this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script rather than editing by hand.
1902 define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1903 ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1904 ; SI-NOHSA: ; %bb.0:
1905 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1906 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1907 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
1908 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
1909 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
1910 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1911 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
1912 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
1913 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1914 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1915 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, 0
1916 ; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v9
1917 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
1918 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
1919 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
1920 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v2
1921 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v3
1922 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
1923 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1924 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v0
1925 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v1
1926 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
1927 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) expcnt(0)
1928 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v6
1929 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v7
1930 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
1931 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
1932 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v4
1933 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v5
1934 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
1935 ; SI-NOHSA-NEXT: s_endpgm
1937 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
1938 ; GCNX3-HSA: ; %bb.0:
1939 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1940 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
1941 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
1942 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
1943 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
1944 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
1945 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
1946 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
1947 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1948 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
1949 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
1950 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1951 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
1952 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1953 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
1954 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
1955 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
1956 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
1957 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
1958 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
1959 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
1960 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
1961 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
1962 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
1963 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
1964 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2
1965 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3
1966 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
1967 ; GCNX3-HSA-NEXT: s_nop 0
1968 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
1969 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
1970 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
1971 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
1972 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
1973 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
1974 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
1975 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
1976 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
1977 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
1978 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
1979 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1980 ; GCNX3-HSA-NEXT: s_endpgm
1982 ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1983 ; GCNX3-NOHSA: ; %bb.0:
1984 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1985 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
1986 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
1987 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
1988 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
1989 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
1990 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
1991 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
1992 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1993 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1994 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
1995 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
1996 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
1997 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
1998 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
1999 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
2000 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
2001 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
2002 ; GCNX3-NOHSA-NEXT: s_nop 0
2003 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
2004 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
2005 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
2006 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
2007 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
2008 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
2009 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
2010 ; GCNX3-NOHSA-NEXT: s_nop 0
2011 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
2012 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
2013 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
2014 ; GCNX3-NOHSA-NEXT: s_endpgm
2016 ; EG-LABEL: global_zextload_v8i32_to_v8i64:
2018 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
2020 ; EG-NEXT: ALU 26, @13, KC0[CB0:0-32], KC1[]
2021 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
2022 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
2023 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
2024 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 1
2026 ; EG-NEXT: Fetch clause starting at 8:
2027 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
2028 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2029 ; EG-NEXT: ALU clause starting at 12:
2030 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2031 ; EG-NEXT: ALU clause starting at 13:
2032 ; EG-NEXT: MOV T2.X, T1.Z,
2033 ; EG-NEXT: MOV T2.Y, 0.0,
2034 ; EG-NEXT: MOV * T3.X, T1.X,
2035 ; EG-NEXT: MOV * T3.Y, 0.0,
2036 ; EG-NEXT: MOV T4.X, T0.Z,
2037 ; EG-NEXT: MOV T4.Y, 0.0,
2038 ; EG-NEXT: MOV * T5.X, T0.X,
2039 ; EG-NEXT: MOV T5.Y, 0.0,
2040 ; EG-NEXT: MOV T2.Z, T1.W,
2041 ; EG-NEXT: MOV T2.W, 0.0,
2042 ; EG-NEXT: MOV * T3.Z, T1.Y,
2043 ; EG-NEXT: MOV * T3.W, 0.0,
2044 ; EG-NEXT: MOV T4.Z, T0.W,
2045 ; EG-NEXT: MOV T4.W, 0.0,
2046 ; EG-NEXT: MOV * T5.Z, T0.Y,
2047 ; EG-NEXT: MOV * T5.W, 0.0,
2048 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2049 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2050 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2051 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
2052 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2053 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2054 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
2055 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2056 ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
2057 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
2058 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2060 ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
2062 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2063 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
2064 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
2065 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2066 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:16
2067 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3]
2068 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
2069 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
2070 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
2071 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
2072 ; GCN-HSA-NEXT: s_nop 0
2073 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
2074 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
2075 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
2076 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
2077 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
2078 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
2079 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
2080 ; GCN-HSA-NEXT: s_nop 0
2081 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
2082 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
2083 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
2084 ; GCN-HSA-NEXT: s_endpgm
2085 %ld = load <8 x i32>, ptr addrspace(1) %in
2086 %ext = zext <8 x i32> %ld to <8 x i64>
2087 store <8 x i64> %ext, ptr addrspace(1) %out
2091 define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2092 ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2093 ; SI-NOHSA: ; %bb.0:
2094 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2095 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
2096 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
2097 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
2098 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
2099 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2100 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
2101 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
2102 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2103 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
2104 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
2105 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2106 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
2107 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2108 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2109 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2110 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2111 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
2112 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2113 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2114 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2115 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2116 ; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v6
2117 ; SI-NOHSA-NEXT: v_mov_b32_e32 v21, v7
2118 ; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v4
2119 ; SI-NOHSA-NEXT: v_mov_b32_e32 v17, v5
2120 ; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v2
2121 ; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v3
2122 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v0
2123 ; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v1
2124 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48
2125 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32
2126 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16
2127 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0
2128 ; SI-NOHSA-NEXT: s_endpgm
2130 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2131 ; GCNX3-HSA: ; %bb.0:
2132 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2133 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2134 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2135 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2136 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
2137 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2138 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
2139 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2140 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
2141 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2142 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2143 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2144 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
2145 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
2146 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
2147 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
2148 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2149 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
2150 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
2151 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2152 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2153 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2154 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
2155 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
2156 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
2157 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
2158 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
2159 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3
2160 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2
2161 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v2
2162 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v3
2163 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
2164 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
2165 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
2166 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
2167 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
2168 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5
2169 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
2170 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
2171 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
2172 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
2173 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4
2174 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
2175 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
2176 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
2177 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
2178 ; GCNX3-HSA-NEXT: s_endpgm
2180 ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2181 ; GCNX3-NOHSA: ; %bb.0:
2182 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2183 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2184 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2185 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2186 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2187 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2188 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2189 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2190 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2191 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2192 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2193 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2194 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
2195 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2196 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
2197 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2198 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2199 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v6
2200 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v7
2201 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2202 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2203 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2204 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2205 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2206 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v4
2207 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, v5
2208 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v2
2209 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v13, v3
2210 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v0
2211 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v1
2212 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
2213 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
2214 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
2215 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
2216 ; GCNX3-NOHSA-NEXT: s_endpgm
2218 ; EG-LABEL: global_sextload_v8i32_to_v8i64:
2220 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
2222 ; EG-NEXT: ALU 31, @13, KC0[CB0:0-32], KC1[]
2223 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
2224 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
2225 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
2226 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1
2228 ; EG-NEXT: Fetch clause starting at 8:
2229 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
2230 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2231 ; EG-NEXT: ALU clause starting at 12:
2232 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2233 ; EG-NEXT: ALU clause starting at 13:
2234 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2235 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2236 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2237 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2238 ; EG-NEXT: ADD_INT T2.W, KC0[2].Y, literal.y,
2239 ; EG-NEXT: ASHR * T4.W, T0.Y, literal.z,
2240 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2241 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2242 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
2243 ; EG-NEXT: ASHR T4.Y, T0.X, literal.y,
2244 ; EG-NEXT: ASHR T6.W, T0.W, literal.y,
2245 ; EG-NEXT: MOV * T4.X, T0.X,
2246 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
2247 ; EG-NEXT: ASHR T6.Y, T0.Z, literal.x,
2248 ; EG-NEXT: ASHR * T7.W, T1.Y, literal.x,
2249 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2250 ; EG-NEXT: MOV T6.X, T0.Z,
2251 ; EG-NEXT: ASHR T7.Y, T1.X, literal.x,
2252 ; EG-NEXT: MOV T4.Z, T0.Y,
2253 ; EG-NEXT: ASHR T8.W, T1.W, literal.x,
2254 ; EG-NEXT: MOV * T7.X, T1.X,
2255 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2256 ; EG-NEXT: ASHR T8.Y, T1.Z, literal.x,
2257 ; EG-NEXT: MOV * T6.Z, T0.W,
2258 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2259 ; EG-NEXT: MOV T8.X, T1.Z,
2260 ; EG-NEXT: MOV T7.Z, T1.Y,
2261 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2262 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2263 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
2264 ; EG-NEXT: MOV * T8.Z, T1.W,
2265 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2267 ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2269 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2270 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0
2271 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2272 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3]
2273 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v23, s[2:3] offset:16
2274 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
2275 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
2276 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
2277 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7
2278 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6
2279 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v6
2280 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v7
2281 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
2282 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3
2283 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
2284 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5
2285 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4
2286 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4
2287 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5
2288 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v2
2289 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v3
2290 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v0
2291 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v1
2292 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[19:22], s[0:1] offset:48
2293 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[15:18], s[0:1] offset:32
2294 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[11:14], s[0:1] offset:16
2295 ; GCN-HSA-NEXT: global_store_dwordx4 v23, v[7:10], s[0:1]
2296 ; GCN-HSA-NEXT: s_endpgm
2297 %ld = load <8 x i32>, ptr addrspace(1) %in
2298 %ext = sext <8 x i32> %ld to <8 x i64>
2299 store <8 x i64> %ext, ptr addrspace(1) %out
; Sign-extending global load of a 16-element vector: the kernel loads
; <16 x i32> from %in, sign-extends every lane to i64, and stores the
; resulting <16 x i64> to %out (both pointers are in addrspace(1)).
;
; In the expected output below, every GCN configuration reads the source with
; four dwordx4 loads, builds each high dword with v_ashrrev_i32 by 31 (a copy
; of the lane's sign bit), and writes the result with eight dwordx4 stores.
; The r600 expectation has the same shape, using four VTX_READ_128 fetches,
; ASHR by a literal 31, and eight STORE_RAW writes.
2303 define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2304 ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2305 ; SI-NOHSA: ; %bb.0:
2306 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
2307 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2308 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
2309 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
2310 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
2311 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2312 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
2313 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
2314 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2315 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2316 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2317 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2318 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
2319 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2320 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2321 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
2322 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v0
2323 ; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v0
2324 ; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v1
2325 ; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2326 ; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2327 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
2328 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
2329 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6
2330 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2331 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2332 ; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v4
2333 ; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5
2334 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
2335 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
2336 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
2337 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
2338 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
2339 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2340 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2341 ; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
2342 ; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
2343 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
2344 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
2345 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
2346 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2347 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2348 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2349 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2350 ; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12
2351 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13
2352 ; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14
2353 ; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15
2354 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
2355 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
2356 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
2357 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2358 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
2359 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
2360 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2361 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2362 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2363 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2364 ; SI-NOHSA-NEXT: s_endpgm
2366 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2367 ; GCNX3-HSA: ; %bb.0:
2368 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2369 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2370 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2371 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2372 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
2373 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
2374 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2375 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
2376 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
2377 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
2378 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
2379 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2380 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
2381 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
2382 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
2383 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
2384 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2385 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2386 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2387 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2388 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2389 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2390 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2391 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2392 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
2393 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2394 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
2395 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
2396 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
2397 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2398 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
2399 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
2400 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
2401 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
2402 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
2403 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2404 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
2405 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9
2406 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8
2407 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
2408 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
2409 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2410 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2411 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2412 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
2413 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2414 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
2415 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
2416 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
2417 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
2418 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10
2419 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10
2420 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11
2421 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2422 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
2423 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
2424 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2425 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2426 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
2427 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
2428 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
2429 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
2430 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14
2431 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15
2432 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
2433 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2434 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2435 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
2436 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
2437 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
2438 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
2439 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
2440 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
2441 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
2442 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
2443 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
2444 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
2445 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
2446 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2447 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
2448 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
2449 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
2450 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
2451 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
2452 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
2453 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
2454 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
2455 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
2456 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
2457 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
2458 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
2459 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
2460 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
2461 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7]
2462 ; GCNX3-HSA-NEXT: s_endpgm
2464 ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2465 ; GCNX3-NOHSA: ; %bb.0:
2466 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2467 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2468 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2469 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2470 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2471 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2472 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2473 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2474 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
2475 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
2476 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2477 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2478 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2479 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2480 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
2481 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2482 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
2483 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2484 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2485 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v4
2486 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v5
2487 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2488 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
2489 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
2490 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6
2491 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7
2492 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
2493 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
2494 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
2495 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
2496 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2497 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2498 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
2499 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
2500 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
2501 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2502 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2503 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8
2504 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9
2505 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v10
2506 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v11
2507 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
2508 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2509 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2510 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2511 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2512 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12
2513 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13
2514 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v14
2515 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v15
2516 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
2517 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
2518 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2519 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2520 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2521 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
2522 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2523 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2524 ; GCNX3-NOHSA-NEXT: s_endpgm
2526 ; EG-LABEL: global_sextload_v16i32_to_v16i64:
2528 ; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
2529 ; EG-NEXT: TEX 3 @12
2530 ; EG-NEXT: ALU 64, @21, KC0[CB0:0-32], KC1[]
2531 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
2532 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
2533 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
2534 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0
2535 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
2536 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0
2537 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0
2538 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1
2540 ; EG-NEXT: Fetch clause starting at 12:
2541 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
2542 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
2543 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
2544 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2545 ; EG-NEXT: ALU clause starting at 20:
2546 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2547 ; EG-NEXT: ALU clause starting at 21:
2548 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x,
2549 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2550 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2551 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2552 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2553 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x,
2554 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2555 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
2556 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2557 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2558 ; EG-NEXT: LSHR T7.X, PV.W, literal.x,
2559 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2560 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
2561 ; EG-NEXT: LSHR T8.X, PV.W, literal.x,
2562 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
2563 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
2564 ; EG-NEXT: LSHR T9.X, PV.W, literal.x,
2565 ; EG-NEXT: ADD_INT T4.W, KC0[2].Y, literal.y,
2566 ; EG-NEXT: ASHR * T10.W, T0.W, literal.z,
2567 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
2568 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2569 ; EG-NEXT: LSHR T11.X, PV.W, literal.x,
2570 ; EG-NEXT: ASHR T10.Y, T0.Z, literal.y,
2571 ; EG-NEXT: ASHR T12.W, T0.Y, literal.y,
2572 ; EG-NEXT: MOV * T10.X, T0.Z,
2573 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
2574 ; EG-NEXT: ASHR T12.Y, T0.X, literal.x,
2575 ; EG-NEXT: ASHR * T13.W, T3.W, literal.x,
2576 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2577 ; EG-NEXT: MOV T12.X, T0.X,
2578 ; EG-NEXT: ASHR T13.Y, T3.Z, literal.x,
2579 ; EG-NEXT: MOV T10.Z, T0.W,
2580 ; EG-NEXT: ASHR T14.W, T3.Y, literal.x,
2581 ; EG-NEXT: MOV * T13.X, T3.Z,
2582 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2583 ; EG-NEXT: ASHR T14.Y, T3.X, literal.x,
2584 ; EG-NEXT: MOV T12.Z, T0.Y,
2585 ; EG-NEXT: ASHR * T0.W, T2.W, literal.x,
2586 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2587 ; EG-NEXT: MOV T14.X, T3.X,
2588 ; EG-NEXT: ASHR T0.Y, T2.Z, literal.x,
2589 ; EG-NEXT: MOV T13.Z, T3.W,
2590 ; EG-NEXT: ASHR T15.W, T2.Y, literal.x,
2591 ; EG-NEXT: MOV * T0.X, T2.Z,
2592 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2593 ; EG-NEXT: ASHR T15.Y, T2.X, literal.x,
2594 ; EG-NEXT: MOV T14.Z, T3.Y,
2595 ; EG-NEXT: ASHR * T3.W, T1.W, literal.x,
2596 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2597 ; EG-NEXT: MOV T15.X, T2.X,
2598 ; EG-NEXT: ASHR T3.Y, T1.Z, literal.x,
2599 ; EG-NEXT: MOV T0.Z, T2.W,
2600 ; EG-NEXT: ASHR T16.W, T1.Y, literal.x,
2601 ; EG-NEXT: MOV * T3.X, T1.Z,
2602 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2603 ; EG-NEXT: ASHR T16.Y, T1.X, literal.x,
2604 ; EG-NEXT: MOV * T15.Z, T2.Y,
2605 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
2606 ; EG-NEXT: MOV T16.X, T1.X,
2607 ; EG-NEXT: MOV T3.Z, T1.W,
2608 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2609 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
2610 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
2611 ; EG-NEXT: MOV * T16.Z, T1.Y,
2612 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2614 ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2616 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2617 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0
2618 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2619 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
2620 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
2621 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
2622 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
2623 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
2624 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
2625 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
2626 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
2627 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
2628 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4
2629 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5
2630 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
2631 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
2632 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
2633 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6
2634 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7
2635 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
2636 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
2637 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0
2638 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
2639 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2
2640 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3
2641 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
2642 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
2643 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
2644 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
2645 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
2646 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8
2647 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9
2648 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
2649 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
2650 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
2651 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
2652 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
2653 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
2654 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
2655 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12
2656 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13
2657 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14
2658 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15
2659 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
2660 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
2661 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
2662 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
2663 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
2664 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
2665 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
2666 ; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
2667 ; GCN-HSA-NEXT: s_endpgm
; IR under test: one vector load, a lane-wise sext to i64, one vector store.
2668 %ld = load <16 x i32>, ptr addrspace(1) %in
2669 %ext = sext <16 x i32> %ld to <16 x i64>
2670 store <16 x i64> %ext, ptr addrspace(1) %out
2674 define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2675 ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2676 ; SI-NOHSA: ; %bb.0:
2677 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
2678 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2679 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
2680 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
2681 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
2682 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2683 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
2684 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
2685 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2686 ; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0
2687 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
2688 ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5
2689 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
2690 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
2691 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2692 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
2693 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
2694 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0
2695 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1
2696 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
2697 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2698 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
2699 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
2700 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
2701 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
2702 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v8
2703 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v9
2704 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2705 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2706 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
2707 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
2708 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
2709 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
2710 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v16
2711 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v17
2712 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
2713 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2714 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v18
2715 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v19
2716 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2717 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2718 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v12
2719 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v13
2720 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
2721 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
2722 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v14
2723 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v15
2724 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2725 ; SI-NOHSA-NEXT: s_endpgm
2727 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
2728 ; GCNX3-HSA: ; %bb.0:
2729 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2730 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
2731 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
2732 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
2733 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
2734 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
2735 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
2736 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2737 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
2738 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2739 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48
2740 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
2741 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2742 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
2743 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
2744 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2745 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
2746 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
2747 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2748 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
2749 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
2750 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
2751 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
2752 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2753 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
2754 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
2755 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
2756 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2757 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
2758 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
2759 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
2760 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2761 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
2762 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
2763 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
2764 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
2765 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
2766 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2767 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
2768 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0
2769 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1
2770 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
2771 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2772 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
2773 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
2774 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2
2775 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3
2776 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2777 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2
2778 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
2779 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3
2780 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
2781 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4
2782 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5
2783 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
2784 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
2785 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
2786 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6
2787 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7
2788 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
2789 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
2790 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
2791 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
2792 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
2793 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
2794 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
2795 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
2796 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10
2797 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11
2798 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
2799 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
2800 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
2801 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
2802 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
2803 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
2804 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
2805 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
2806 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14
2807 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15
2808 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
2809 ; GCNX3-HSA-NEXT: s_endpgm
2811 ; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2812 ; GCNX3-NOHSA: ; %bb.0:
2813 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
2814 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
2815 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
2816 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
2817 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
2818 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
2819 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
2820 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
2821 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2822 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2823 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2824 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2825 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, 0
2826 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v17
2827 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
2828 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
2829 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
2830 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v0
2831 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v1
2832 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
2833 ; GCNX3-NOHSA-NEXT: s_nop 0
2834 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2
2835 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3
2836 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2837 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
2838 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v4
2839 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v5
2840 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
2841 ; GCNX3-NOHSA-NEXT: s_nop 0
2842 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v6
2843 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v7
2844 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2845 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
2846 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v8
2847 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v9
2848 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
2849 ; GCNX3-NOHSA-NEXT: s_nop 0
2850 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v10
2851 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v11
2852 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2853 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
2854 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v12
2855 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v13
2856 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
2857 ; GCNX3-NOHSA-NEXT: s_nop 0
2858 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v14
2859 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v15
2860 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
2861 ; GCNX3-NOHSA-NEXT: s_endpgm
2863 ; EG-LABEL: global_zextload_v16i32_to_v16i64:
2865 ; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
2866 ; EG-NEXT: TEX 3 @12
2867 ; EG-NEXT: ALU 55, @21, KC0[CB0:0-32], KC1[]
2868 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
2869 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
2870 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
2871 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
2872 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0
2873 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0
2874 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0
2875 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1
2877 ; EG-NEXT: Fetch clause starting at 12:
2878 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
2879 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
2880 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
2881 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
2882 ; EG-NEXT: ALU clause starting at 20:
2883 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2884 ; EG-NEXT: ALU clause starting at 21:
2885 ; EG-NEXT: MOV T4.X, T1.X,
2886 ; EG-NEXT: MOV T4.Y, 0.0,
2887 ; EG-NEXT: MOV * T5.X, T1.Z,
2888 ; EG-NEXT: MOV * T5.Y, 0.0,
2889 ; EG-NEXT: MOV T6.X, T0.X,
2890 ; EG-NEXT: MOV T6.Y, 0.0,
2891 ; EG-NEXT: MOV * T7.X, T0.Z,
2892 ; EG-NEXT: MOV * T7.Y, 0.0,
2893 ; EG-NEXT: MOV T8.X, T3.X,
2894 ; EG-NEXT: MOV T8.Y, 0.0,
2895 ; EG-NEXT: MOV * T9.X, T3.Z,
2896 ; EG-NEXT: MOV * T9.Y, 0.0,
2897 ; EG-NEXT: MOV T10.X, T2.X,
2898 ; EG-NEXT: MOV T10.Y, 0.0,
2899 ; EG-NEXT: MOV * T11.X, T2.Z,
2900 ; EG-NEXT: MOV T11.Y, 0.0,
2901 ; EG-NEXT: MOV T4.Z, T1.Y,
2902 ; EG-NEXT: MOV T4.W, 0.0,
2903 ; EG-NEXT: MOV * T5.Z, T1.W,
2904 ; EG-NEXT: MOV * T5.W, 0.0,
2905 ; EG-NEXT: MOV T6.Z, T0.Y,
2906 ; EG-NEXT: MOV T6.W, 0.0,
2907 ; EG-NEXT: MOV * T7.Z, T0.W,
2908 ; EG-NEXT: MOV * T7.W, 0.0,
2909 ; EG-NEXT: MOV T8.Z, T3.Y,
2910 ; EG-NEXT: MOV T8.W, 0.0,
2911 ; EG-NEXT: MOV * T9.Z, T3.W,
2912 ; EG-NEXT: MOV * T9.W, 0.0,
2913 ; EG-NEXT: MOV T10.Z, T2.Y,
2914 ; EG-NEXT: MOV T10.W, 0.0,
2915 ; EG-NEXT: MOV * T11.Z, T2.W,
2916 ; EG-NEXT: MOV T11.W, 0.0,
2917 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2918 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2919 ; EG-NEXT: LSHR T0.X, PS, literal.x,
2920 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2921 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2922 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
2923 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
2924 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
2925 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2926 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2927 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2928 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2929 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
2930 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
2931 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2932 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
2933 ; EG-NEXT: LSHR T13.X, PV.W, literal.x,
2934 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2935 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
2936 ; EG-NEXT: LSHR T14.X, PV.W, literal.x,
2937 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
2938 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
2939 ; EG-NEXT: LSHR * T15.X, PV.W, literal.x,
2940 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2942 ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64:
2944 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2945 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
2946 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
2947 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
2948 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:48
2949 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:32
2950 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:16
2951 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3]
2952 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
2953 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
2954 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
2955 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
2956 ; GCN-HSA-NEXT: s_nop 0
2957 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
2958 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
2959 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
2960 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
2961 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
2962 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
2963 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
2964 ; GCN-HSA-NEXT: s_nop 0
2965 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
2966 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
2967 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
2968 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
2969 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12
2970 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13
2971 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
2972 ; GCN-HSA-NEXT: s_nop 0
2973 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14
2974 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15
2975 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
2976 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
2977 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16
2978 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17
2979 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
2980 ; GCN-HSA-NEXT: s_nop 0
2981 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18
2982 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19
2983 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
2984 ; GCN-HSA-NEXT: s_endpgm
2985 %ld = load <16 x i32>, ptr addrspace(1) %in
2986 %ext = zext <16 x i32> %ld to <16 x i64>
2987 store <16 x i64> %ext, ptr addrspace(1) %out
2991 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2992 ; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
2993 ; SI-NOHSA: ; %bb.0:
2994 ; SI-NOHSA-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2995 ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2996 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1
2997 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000
2998 ; SI-NOHSA-NEXT: s_add_u32 s12, s12, s11
2999 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0
3000 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
3001 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3002 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
3003 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
3004 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
3005 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3006 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
3007 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
3008 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
3009 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3010 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3011 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3012 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3013 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3014 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
3015 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
3016 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
3017 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31
3018 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30
3019 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
3020 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
3021 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
3022 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
3023 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
3024 ; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v12
3025 ; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13
3026 ; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14
3027 ; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15
3028 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
3029 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
3030 ; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28
3031 ; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
3032 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
3033 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
3034 ; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
3035 ; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
3036 ; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
3037 ; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
3038 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(9)
3039 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
3040 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
3041 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3042 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
3043 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
3044 ; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v4
3045 ; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5
3046 ; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6
3047 ; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7
3048 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
3049 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
3050 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
3051 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
3052 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
3053 ; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v0
3054 ; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1
3055 ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
3056 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
3057 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
3058 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
3059 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
3060 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
3061 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
3062 ; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16
3063 ; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17
3064 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3065 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3066 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
3067 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
3068 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
3069 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
3070 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
3071 ; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20
3072 ; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21
3073 ; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22
3074 ; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23
3075 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
3076 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27
3077 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26
3078 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25
3079 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24
3080 ; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24
3081 ; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25
3082 ; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26
3083 ; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27
3084 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
3085 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11
3086 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10
3087 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
3088 ; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
3089 ; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
3090 ; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
3091 ; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10
3092 ; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11
3093 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
3094 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
3095 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3096 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3097 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
3098 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
3099 ; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
3100 ; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3101 ; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
3102 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
3103 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
3104 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3105 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
3106 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3107 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3108 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3109 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3110 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3111 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3112 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
3113 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
3114 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
3115 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
3116 ; SI-NOHSA-NEXT: s_endpgm
3118 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3119 ; GCNX3-HSA: ; %bb.0:
3120 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3121 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
3122 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3123 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3124 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
3125 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70
3126 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3127 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3128 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3129 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1]
3130 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x60
3131 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3132 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3133 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3134 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50
3135 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
3136 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3137 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3138 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3139 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
3140 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
3141 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3142 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
3143 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
3144 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
3145 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
3146 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3147 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
3148 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
3149 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9]
3150 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
3151 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
3152 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
3153 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
3154 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3155 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
3156 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3157 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
3158 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3159 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
3160 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
3161 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3162 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1
3163 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0
3164 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
3165 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
3166 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
3167 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28
3168 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29
3169 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
3170 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
3171 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0
3172 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3173 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35]
3174 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3
3175 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2
3176 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0
3177 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31
3178 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30
3179 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30
3180 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31
3181 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3182 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35]
3183 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8)
3184 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25
3185 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
3186 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
3187 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0
3188 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3189 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3
3190 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2
3191 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0
3192 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24
3193 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24
3194 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25
3195 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3196 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
3197 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3
3198 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2
3199 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0
3200 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27
3201 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26
3202 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26
3203 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27
3204 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3205 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
3206 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
3207 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
3208 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0
3209 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3210 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3
3211 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2
3212 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80
3213 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
3214 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v21
3215 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20
3216 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20
3217 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21
3218 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3219 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23
3220 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22
3221 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v22
3222 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v23
3223 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27]
3224 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
3225 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
3226 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15
3227 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14
3228 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13
3229 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12
3230 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12
3231 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13
3232 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14
3233 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15
3234 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
3235 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
3236 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
3237 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
3238 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
3239 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
3240 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
3241 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
3242 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3243 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
3244 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
3245 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
3246 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
3247 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
3248 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
3249 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
3250 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
3251 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
3252 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7
3253 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3254 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
3255 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
3256 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16
3257 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16
3258 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
3259 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
3260 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
3261 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17
3262 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17
3263 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3264 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
3265 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
3266 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
3267 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
3268 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19
3269 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18
3270 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18
3271 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19
3272 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3273 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26]
3274 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
3275 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9
3276 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8
3277 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8
3278 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9
3279 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
3280 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
3281 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
3282 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3283 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18]
3284 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
3285 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
3286 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
3287 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1
3288 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0
3289 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
3290 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11
3291 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0
3292 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1
3293 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3294 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3295 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3296 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
3297 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
3298 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
3299 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
3300 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3
3301 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2
3302 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
3303 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10
3304 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2
3305 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3
3306 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
3307 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
3308 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
3309 ; GCNX3-HSA-NEXT: s_endpgm
3311 ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
3312 ; GCNX3-NOHSA: ; %bb.0:
3313 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
3314 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3315 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
3316 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
3317 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
3318 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3319 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
3320 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
3321 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3322 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3323 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3324 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3325 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3326 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3327 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
3328 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
3329 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
3330 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
3331 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
3332 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11
3333 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10
3334 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
3335 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
3336 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
3337 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
3338 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
3339 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12
3340 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13
3341 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14
3342 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15
3343 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
3344 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8
3345 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8
3346 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9
3347 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10
3348 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11
3349 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
3350 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
3351 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
3352 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
3353 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
3354 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4
3355 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5
3356 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
3357 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
3358 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
3359 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
3360 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
3361 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
3362 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
3363 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
3364 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
3365 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3366 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
3367 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
3368 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
3369 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
3370 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0
3371 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1
3372 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
3373 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
3374 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
3375 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16
3376 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17
3377 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3378 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
3379 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
3380 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
3381 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v20
3382 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v21
3383 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v22
3384 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v23
3385 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
3386 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
3387 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26
3388 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3389 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3390 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
3391 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24
3392 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
3393 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31
3394 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30
3395 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
3396 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31
3397 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29
3398 ; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28
3399 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28
3400 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29
3401 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
3402 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3403 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
3404 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3405 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3406 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3407 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3408 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3409 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3410 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
3411 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30
3412 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24
3413 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25
3414 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26
3415 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27
3416 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
3417 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0
3418 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
3419 ; GCNX3-NOHSA-NEXT: s_endpgm
3421 ; EG-LABEL: global_sextload_v32i32_to_v32i64:
3423 ; EG-NEXT: ALU 33, @36, KC0[CB0:0-32], KC1[]
3424 ; EG-NEXT: TEX 7 @20
3425 ; EG-NEXT: ALU 96, @70, KC0[CB0:0-32], KC1[]
3426 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
3427 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
3428 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
3429 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
3430 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
3431 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
3432 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
3433 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
3434 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
3435 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
3436 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
3437 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
3438 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
3439 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
3440 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
3441 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
3443 ; EG-NEXT: Fetch clause starting at 20:
3444 ; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 112, #1
3445 ; EG-NEXT: VTX_READ_128 T13.XYZW, T11.X, 96, #1
3446 ; EG-NEXT: VTX_READ_128 T14.XYZW, T11.X, 80, #1
3447 ; EG-NEXT: VTX_READ_128 T15.XYZW, T11.X, 64, #1
3448 ; EG-NEXT: VTX_READ_128 T16.XYZW, T11.X, 48, #1
3449 ; EG-NEXT: VTX_READ_128 T17.XYZW, T11.X, 32, #1
3450 ; EG-NEXT: VTX_READ_128 T18.XYZW, T11.X, 16, #1
3451 ; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
3452 ; EG-NEXT: ALU clause starting at 36:
3453 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3454 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3455 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
3456 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3457 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3458 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3459 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
3460 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
3461 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3462 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
3463 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
3464 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3465 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
3466 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
3467 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3468 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
3469 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
3470 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3471 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
3472 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
3473 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3474 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
3475 ; EG-NEXT: LSHR T7.X, PV.W, literal.x,
3476 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3477 ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
3478 ; EG-NEXT: LSHR T8.X, PV.W, literal.x,
3479 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3480 ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
3481 ; EG-NEXT: LSHR T9.X, PV.W, literal.x,
3482 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3483 ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
3484 ; EG-NEXT: LSHR T10.X, PV.W, literal.x,
3485 ; EG-NEXT: MOV * T11.X, KC0[2].Z,
3486 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3487 ; EG-NEXT: ALU clause starting at 70:
3488 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3489 ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
3490 ; EG-NEXT: LSHR T19.X, PV.W, literal.x,
3491 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3492 ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
3493 ; EG-NEXT: LSHR T20.X, PV.W, literal.x,
3494 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3495 ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
3496 ; EG-NEXT: LSHR T21.X, PV.W, literal.x,
3497 ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
3498 ; EG-NEXT: ASHR * T22.W, T11.W, literal.z,
3499 ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
3500 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3501 ; EG-NEXT: LSHR T23.X, PV.W, literal.x,
3502 ; EG-NEXT: ASHR T22.Y, T11.Z, literal.y,
3503 ; EG-NEXT: ASHR T24.W, T11.Y, literal.y,
3504 ; EG-NEXT: MOV * T22.X, T11.Z,
3505 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
3506 ; EG-NEXT: ASHR T24.Y, T11.X, literal.x,
3507 ; EG-NEXT: ASHR * T25.W, T18.W, literal.x,
3508 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3509 ; EG-NEXT: MOV T24.X, T11.X,
3510 ; EG-NEXT: ASHR T25.Y, T18.Z, literal.x,
3511 ; EG-NEXT: MOV T22.Z, T11.W,
3512 ; EG-NEXT: ASHR T26.W, T18.Y, literal.x,
3513 ; EG-NEXT: MOV * T25.X, T18.Z,
3514 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3515 ; EG-NEXT: ASHR T26.Y, T18.X, literal.x,
3516 ; EG-NEXT: MOV T24.Z, T11.Y,
3517 ; EG-NEXT: ASHR * T11.W, T17.W, literal.x,
3518 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3519 ; EG-NEXT: MOV T26.X, T18.X,
3520 ; EG-NEXT: ASHR T11.Y, T17.Z, literal.x,
3521 ; EG-NEXT: MOV T25.Z, T18.W,
3522 ; EG-NEXT: ASHR T27.W, T17.Y, literal.x,
3523 ; EG-NEXT: MOV * T11.X, T17.Z,
3524 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3525 ; EG-NEXT: ASHR T27.Y, T17.X, literal.x,
3526 ; EG-NEXT: MOV T26.Z, T18.Y,
3527 ; EG-NEXT: ASHR * T18.W, T16.W, literal.x,
3528 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3529 ; EG-NEXT: MOV T27.X, T17.X,
3530 ; EG-NEXT: ASHR T18.Y, T16.Z, literal.x,
3531 ; EG-NEXT: MOV T11.Z, T17.W,
3532 ; EG-NEXT: ASHR T28.W, T16.Y, literal.x,
3533 ; EG-NEXT: MOV * T18.X, T16.Z,
3534 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3535 ; EG-NEXT: ASHR T28.Y, T16.X, literal.x,
3536 ; EG-NEXT: MOV T27.Z, T17.Y,
3537 ; EG-NEXT: ASHR * T17.W, T15.W, literal.x,
3538 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3539 ; EG-NEXT: MOV T28.X, T16.X,
3540 ; EG-NEXT: ASHR T17.Y, T15.Z, literal.x,
3541 ; EG-NEXT: MOV T18.Z, T16.W,
3542 ; EG-NEXT: ASHR T29.W, T15.Y, literal.x,
3543 ; EG-NEXT: MOV * T17.X, T15.Z,
3544 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3545 ; EG-NEXT: ASHR T29.Y, T15.X, literal.x,
3546 ; EG-NEXT: MOV T28.Z, T16.Y,
3547 ; EG-NEXT: ASHR * T16.W, T14.W, literal.x,
3548 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3549 ; EG-NEXT: MOV T29.X, T15.X,
3550 ; EG-NEXT: ASHR T16.Y, T14.Z, literal.x,
3551 ; EG-NEXT: MOV T17.Z, T15.W,
3552 ; EG-NEXT: ASHR T30.W, T14.Y, literal.x,
3553 ; EG-NEXT: MOV * T16.X, T14.Z,
3554 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3555 ; EG-NEXT: ASHR T30.Y, T14.X, literal.x,
3556 ; EG-NEXT: MOV T29.Z, T15.Y,
3557 ; EG-NEXT: ASHR * T15.W, T13.W, literal.x,
3558 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3559 ; EG-NEXT: MOV T30.X, T14.X,
3560 ; EG-NEXT: ASHR T15.Y, T13.Z, literal.x,
3561 ; EG-NEXT: MOV T16.Z, T14.W,
3562 ; EG-NEXT: ASHR T31.W, T13.Y, literal.x,
3563 ; EG-NEXT: MOV * T15.X, T13.Z,
3564 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3565 ; EG-NEXT: ASHR T31.Y, T13.X, literal.x,
3566 ; EG-NEXT: MOV T30.Z, T14.Y,
3567 ; EG-NEXT: ASHR * T14.W, T12.W, literal.x,
3568 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3569 ; EG-NEXT: MOV T31.X, T13.X,
3570 ; EG-NEXT: ASHR T14.Y, T12.Z, literal.x,
3571 ; EG-NEXT: MOV T15.Z, T13.W,
3572 ; EG-NEXT: ASHR T32.W, T12.Y, literal.x,
3573 ; EG-NEXT: MOV * T14.X, T12.Z,
3574 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3575 ; EG-NEXT: ASHR T32.Y, T12.X, literal.x,
3576 ; EG-NEXT: MOV * T31.Z, T13.Y,
3577 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
3578 ; EG-NEXT: MOV T32.X, T12.X,
3579 ; EG-NEXT: MOV T14.Z, T12.W,
3580 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3581 ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
3582 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
3583 ; EG-NEXT: MOV * T32.Z, T12.Y,
3584 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3586 ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3587 ; GCN-GFX900-HSA: ; %bb.0:
3588 ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
3589 ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
3590 ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3591 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
3592 ; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
3593 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
3594 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
3595 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3596 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3597 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3598 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3599 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3600 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3601 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3602 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
3603 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
3604 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v2
3605 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v3
3606 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(4)
3607 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
3608 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
3609 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5
3610 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4
3611 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v4
3612 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v5
3613 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6
3614 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7
3615 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
3616 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
3617 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
3618 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
3619 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
3620 ; GCN-GFX900-HSA-NEXT: s_nop 0
3621 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
3622 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
3623 ; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
3624 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
3625 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
3626 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
3627 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
3628 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9
3629 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v9
3630 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v10
3631 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v11
3632 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v12
3633 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
3634 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
3635 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
3636 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14
3637 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13
3638 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v13
3639 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v14
3640 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v9, v15
3641 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v11, v16
3642 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3643 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
3644 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
3645 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18
3646 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17
3647 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v17
3648 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v18
3649 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19
3650 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3651 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20
3652 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
3653 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
3654 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
3655 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
3656 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
3657 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21
3658 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22
3659 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23
3660 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24
3661 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3]
3662 ; GCN-GFX900-HSA-NEXT: s_nop 0
3663 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3664 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3665 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3666 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
3667 ; GCN-GFX900-HSA-NEXT: s_nop 0
3668 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
3669 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
3670 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
3671 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
3672 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
3673 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
3674 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50
3675 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49
3676 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v49
3677 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v50
3678 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v51
3679 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v52
3680 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
3681 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24
3682 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23
3683 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22
3684 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21
3685 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v21
3686 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v22
3687 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
3688 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3689 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3690 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3691 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3692 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3693 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3694 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3695 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3696 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3697 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3698 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3699 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
3700 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v23
3701 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v24
3702 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3703 ; GCN-GFX900-HSA-NEXT: s_endpgm
3705 ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3706 ; GCN-GFX908-HSA: ; %bb.0:
3707 ; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3708 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0
3709 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
3710 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3711 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3712 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3713 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3714 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3715 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3716 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3717 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6)
3718 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v2
3719 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
3720 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
3721 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v3
3722 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v25
3723 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v26
3724 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v27
3725 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v28
3726 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(4)
3727 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
3728 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
3729 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
3730 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9
3731 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v9
3732 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v10
3733 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v11
3734 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v12
3735 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3736 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16
3737 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15
3738 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14
3739 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13
3740 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v13
3741 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v14
3742 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15
3743 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16
3744 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
3745 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
3746 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
3747 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18
3748 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17
3749 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v17
3750 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v18
3751 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19
3752 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20
3753 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
3754 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
3755 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
3756 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
3757 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
3758 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21
3759 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22
3760 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23
3761 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24
3762 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3]
3763 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
3764 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5
3765 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4
3766 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v4
3767 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v5
3768 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
3769 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6
3770 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7
3771 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3772 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3773 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a3
3774 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
3775 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
3776 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v0
3777 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v1
3778 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a2
3779 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a1
3780 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v32, a0
3781 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3782 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
3783 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
3784 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50
3785 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49
3786 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v49
3787 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v50
3788 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v51
3789 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v52
3790 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3791 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
3792 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24
3793 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23
3794 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22
3795 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21
3796 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v21
3797 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v22
3798 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3799 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3800 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3801 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3802 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3803 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3804 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3805 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3806 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3807 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3808 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3809 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
3810 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v28, v23
3811 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v30, v24
3812 ; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3813 ; GCN-GFX908-HSA-NEXT: s_endpgm
3814 %ld = load <32 x i32>, ptr addrspace(1) %in
3815 %ext = sext <32 x i32> %ld to <32 x i64>
3816 store <32 x i64> %ext, ptr addrspace(1) %out
3820 define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3821 ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
3822 ; SI-NOHSA: ; %bb.0:
3823 ; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
3824 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000
3825 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1
3826 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0
3827 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2
3828 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3
3829 ; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1
3830 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
3831 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6
3832 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7
3833 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4
3834 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5
3835 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
3836 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3837 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
3838 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
3839 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3840 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
3841 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
3842 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
3843 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
3844 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
3845 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
3846 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
3847 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3848 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
3849 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
3850 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
3851 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0)
3852 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
3853 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
3854 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
3855 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3856 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
3857 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
3858 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
3859 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
3860 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32
3861 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v33
3862 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
3863 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3864 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v34
3865 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v35
3866 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
3867 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3868 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v28
3869 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v29
3870 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
3871 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3872 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v30
3873 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v31
3874 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
3875 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3876 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v24
3877 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v25
3878 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
3879 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3880 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v26
3881 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v27
3882 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3883 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3884 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v20
3885 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v21
3886 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
3887 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3888 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v22
3889 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v23
3890 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
3891 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3892 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v16
3893 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v17
3894 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
3895 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3896 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
3897 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
3898 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
3899 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3900 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12
3901 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13
3902 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3903 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
3904 ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14
3905 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15
3906 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3907 ; SI-NOHSA-NEXT: s_endpgm
3909 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
3910 ; GCNX3-HSA: ; %bb.0:
3911 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3912 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
3913 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
3914 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
3915 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
3916 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3917 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
3918 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3919 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48
3920 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
3921 ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0
3922 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64
3923 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0
3924 ; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50
3925 ; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0
3926 ; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60
3927 ; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0
3928 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x70
3929 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
3930 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
3931 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
3932 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1]
3933 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s14
3934 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s15
3935 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1]
3936 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s12
3937 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s13
3938 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
3939 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s10
3940 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s11
3941 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1]
3942 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8
3943 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9
3944 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
3945 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
3946 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
3947 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
3948 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
3949 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
3950 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
3951 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0
3952 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
3953 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1
3954 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3955 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
3956 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28
3957 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29
3958 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1
3959 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0
3960 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
3961 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
3962 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
3963 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0
3964 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3965 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30
3966 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31
3967 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
3968 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
3969 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0
3970 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3971 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
3972 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
3973 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
3974 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0
3975 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3976 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8)
3977 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32
3978 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33
3979 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
3980 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
3981 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
3982 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0
3983 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3984 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3
3985 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2
3986 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0
3987 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3988 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34
3989 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35
3990 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3
3991 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2
3992 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0
3993 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3994 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
3995 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
3996 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
3997 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80
3998 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
3999 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
4000 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24
4001 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25
4002 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
4003 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
4004 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
4005 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4006 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
4007 ; GCNX3-HSA-NEXT: s_nop 0
4008 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26
4009 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27
4010 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
4011 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
4012 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
4013 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4014 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
4015 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
4016 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
4017 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20
4018 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21
4019 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3]
4020 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
4021 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22
4022 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23
4023 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
4024 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
4025 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4026 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11)
4027 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16
4028 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17
4029 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
4030 ; GCNX3-HSA-NEXT: s_nop 0
4031 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18
4032 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19
4033 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
4034 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
4035 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12
4036 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13
4037 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
4038 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
4039 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
4040 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
4041 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4042 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14
4043 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15
4044 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
4045 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
4046 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8
4047 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9
4048 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
4049 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
4050 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
4051 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4052 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
4053 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
4054 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
4055 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
4056 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10
4057 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11
4058 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4059 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
4060 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
4061 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
4062 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
4063 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
4064 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
4065 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
4066 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4067 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
4068 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6
4069 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7
4070 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
4071 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4072 ; GCNX3-HSA-NEXT: s_endpgm
4074 ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
4075 ; GCNX3-NOHSA: ; %bb.0:
4076 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
4077 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
4078 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
4079 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
4080 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
4081 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4082 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
4083 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
4084 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4085 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4086 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4087 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4088 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
4089 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
4090 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
4091 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0
4092 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, 0
4093 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v31, v29
4094 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
4095 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
4096 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4097 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v0
4098 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v1
4099 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224
4100 ; GCNX3-NOHSA-NEXT: s_nop 0
4101 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v2
4102 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v3
4103 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240
4104 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(8)
4105 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v4
4106 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v5
4107 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192
4108 ; GCNX3-NOHSA-NEXT: s_nop 0
4109 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v6
4110 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v7
4111 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208
4112 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(9)
4113 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8
4114 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9
4115 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
4116 ; GCNX3-NOHSA-NEXT: s_nop 0
4117 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v10
4118 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v11
4119 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
4120 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(10)
4121 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v12
4122 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v13
4123 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128
4124 ; GCNX3-NOHSA-NEXT: s_nop 0
4125 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v14
4126 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v15
4127 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
4128 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(11)
4129 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v16
4130 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v17
4131 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
4132 ; GCNX3-NOHSA-NEXT: s_nop 0
4133 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v18
4134 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v19
4135 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
4136 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(12)
4137 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v20
4138 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v21
4139 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
4140 ; GCNX3-NOHSA-NEXT: s_nop 0
4141 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v22
4142 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v23
4143 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
4144 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(13)
4145 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v24
4146 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v25
4147 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
4148 ; GCNX3-NOHSA-NEXT: s_nop 0
4149 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v26
4150 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v27
4151 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
4152 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(14)
4153 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v32
4154 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v33
4155 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
4156 ; GCNX3-NOHSA-NEXT: s_nop 0
4157 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v34
4158 ; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v35
4159 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
4160 ; GCNX3-NOHSA-NEXT: s_endpgm
4162 ; EG-LABEL: global_zextload_v32i32_to_v32i64:
4164 ; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
4165 ; EG-NEXT: TEX 2 @22
4166 ; EG-NEXT: ALU 10, @39, KC0[], KC1[]
4167 ; EG-NEXT: TEX 4 @28
4168 ; EG-NEXT: ALU 100, @50, KC0[CB0:0-32], KC1[]
4169 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
4170 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
4171 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
4172 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
4173 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
4174 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
4175 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
4176 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
4177 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
4178 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
4179 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
4180 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
4181 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
4182 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
4183 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
4184 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
4186 ; EG-NEXT: Fetch clause starting at 22:
4187 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 112, #1
4188 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 80, #1
4189 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 96, #1
4190 ; EG-NEXT: Fetch clause starting at 28:
4191 ; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
4192 ; EG-NEXT: VTX_READ_128 T11.XYZW, T0.X, 16, #1
4193 ; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 32, #1
4194 ; EG-NEXT: VTX_READ_128 T13.XYZW, T0.X, 48, #1
4195 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 64, #1
4196 ; EG-NEXT: ALU clause starting at 38:
4197 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
4198 ; EG-NEXT: ALU clause starting at 39:
4199 ; EG-NEXT: MOV T4.X, T1.X,
4200 ; EG-NEXT: MOV T4.Y, 0.0,
4201 ; EG-NEXT: MOV * T5.X, T1.Z,
4202 ; EG-NEXT: MOV * T5.Y, 0.0,
4203 ; EG-NEXT: MOV T6.X, T3.X,
4204 ; EG-NEXT: MOV T6.Y, 0.0,
4205 ; EG-NEXT: MOV * T7.X, T3.Z,
4206 ; EG-NEXT: MOV * T7.Y, 0.0,
4207 ; EG-NEXT: MOV T8.X, T2.X,
4208 ; EG-NEXT: MOV T8.Y, 0.0,
4209 ; EG-NEXT: MOV * T9.X, T2.Z,
4210 ; EG-NEXT: ALU clause starting at 50:
4211 ; EG-NEXT: MOV * T9.Y, 0.0,
4212 ; EG-NEXT: MOV T14.X, T0.X,
4213 ; EG-NEXT: MOV T14.Y, 0.0,
4214 ; EG-NEXT: MOV * T15.X, T0.Z,
4215 ; EG-NEXT: MOV * T15.Y, 0.0,
4216 ; EG-NEXT: MOV T16.X, T13.X,
4217 ; EG-NEXT: MOV T16.Y, 0.0,
4218 ; EG-NEXT: MOV * T17.X, T13.Z,
4219 ; EG-NEXT: MOV * T17.Y, 0.0,
4220 ; EG-NEXT: MOV T18.X, T12.X,
4221 ; EG-NEXT: MOV T18.Y, 0.0,
4222 ; EG-NEXT: MOV * T19.X, T12.Z,
4223 ; EG-NEXT: MOV * T19.Y, 0.0,
4224 ; EG-NEXT: MOV T20.X, T11.X,
4225 ; EG-NEXT: MOV T20.Y, 0.0,
4226 ; EG-NEXT: MOV * T21.X, T11.Z,
4227 ; EG-NEXT: MOV * T21.Y, 0.0,
4228 ; EG-NEXT: MOV T22.X, T10.X,
4229 ; EG-NEXT: MOV T22.Y, 0.0,
4230 ; EG-NEXT: MOV * T23.X, T10.Z,
4231 ; EG-NEXT: MOV T23.Y, 0.0,
4232 ; EG-NEXT: MOV T4.Z, T1.Y,
4233 ; EG-NEXT: MOV T4.W, 0.0,
4234 ; EG-NEXT: MOV * T5.Z, T1.W,
4235 ; EG-NEXT: MOV * T5.W, 0.0,
4236 ; EG-NEXT: MOV T6.Z, T3.Y,
4237 ; EG-NEXT: MOV T6.W, 0.0,
4238 ; EG-NEXT: MOV * T7.Z, T3.W,
4239 ; EG-NEXT: MOV * T7.W, 0.0,
4240 ; EG-NEXT: MOV T8.Z, T2.Y,
4241 ; EG-NEXT: MOV T8.W, 0.0,
4242 ; EG-NEXT: MOV * T9.Z, T2.W,
4243 ; EG-NEXT: MOV * T9.W, 0.0,
4244 ; EG-NEXT: MOV T14.Z, T0.Y,
4245 ; EG-NEXT: MOV T14.W, 0.0,
4246 ; EG-NEXT: MOV * T15.Z, T0.W,
4247 ; EG-NEXT: MOV * T15.W, 0.0,
4248 ; EG-NEXT: MOV T16.Z, T13.Y,
4249 ; EG-NEXT: MOV T16.W, 0.0,
4250 ; EG-NEXT: MOV * T17.Z, T13.W,
4251 ; EG-NEXT: MOV * T17.W, 0.0,
4252 ; EG-NEXT: MOV T18.Z, T12.Y,
4253 ; EG-NEXT: MOV T18.W, 0.0,
4254 ; EG-NEXT: MOV * T19.Z, T12.W,
4255 ; EG-NEXT: MOV * T19.W, 0.0,
4256 ; EG-NEXT: MOV T20.Z, T11.Y,
4257 ; EG-NEXT: MOV T20.W, 0.0,
4258 ; EG-NEXT: MOV * T21.Z, T11.W,
4259 ; EG-NEXT: MOV * T21.W, 0.0,
4260 ; EG-NEXT: MOV T22.Z, T10.Y,
4261 ; EG-NEXT: MOV T22.W, 0.0,
4262 ; EG-NEXT: MOV * T23.Z, T10.W,
4263 ; EG-NEXT: MOV T23.W, 0.0,
4264 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4265 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4266 ; EG-NEXT: LSHR T0.X, PS, literal.x,
4267 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4268 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4269 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4270 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4271 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4272 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4273 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
4274 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
4275 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4276 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
4277 ; EG-NEXT: LSHR T10.X, PV.W, literal.x,
4278 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4279 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
4280 ; EG-NEXT: LSHR T11.X, PV.W, literal.x,
4281 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4282 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
4283 ; EG-NEXT: LSHR T12.X, PV.W, literal.x,
4284 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4285 ; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
4286 ; EG-NEXT: LSHR T13.X, PV.W, literal.x,
4287 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4288 ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
4289 ; EG-NEXT: LSHR T24.X, PV.W, literal.x,
4290 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4291 ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
4292 ; EG-NEXT: LSHR T25.X, PV.W, literal.x,
4293 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4294 ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
4295 ; EG-NEXT: LSHR T26.X, PV.W, literal.x,
4296 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4297 ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
4298 ; EG-NEXT: LSHR T27.X, PV.W, literal.x,
4299 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4300 ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
4301 ; EG-NEXT: LSHR T28.X, PV.W, literal.x,
4302 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4303 ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
4304 ; EG-NEXT: LSHR T29.X, PV.W, literal.x,
4305 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4306 ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
4307 ; EG-NEXT: LSHR T30.X, PV.W, literal.x,
4308 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4309 ; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
4310 ; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
4311 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4313 ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64:
4315 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4316 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
4317 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
4318 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
4319 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:112
4320 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:96
4321 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:80
4322 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:64
4323 ; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:48
4324 ; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:32
4325 ; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:16
4326 ; GCN-HSA-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
4327 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4328 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
4329 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
4330 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:224
4331 ; GCN-HSA-NEXT: s_nop 0
4332 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
4333 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
4334 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:240
4335 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
4336 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
4337 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
4338 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:192
4339 ; GCN-HSA-NEXT: s_nop 0
4340 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
4341 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
4342 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:208
4343 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
4344 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12
4345 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13
4346 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:160
4347 ; GCN-HSA-NEXT: s_nop 0
4348 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14
4349 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15
4350 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:176
4351 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
4352 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16
4353 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17
4354 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:128
4355 ; GCN-HSA-NEXT: s_nop 0
4356 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18
4357 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19
4358 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:144
4359 ; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
4360 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v20
4361 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v21
4362 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
4363 ; GCN-HSA-NEXT: s_nop 0
4364 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v22
4365 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v23
4366 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
4367 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
4368 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v24
4369 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v25
4370 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
4371 ; GCN-HSA-NEXT: s_nop 0
4372 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v26
4373 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v27
4374 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
4375 ; GCN-HSA-NEXT: s_waitcnt vmcnt(13)
4376 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v28
4377 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v29
4378 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
4379 ; GCN-HSA-NEXT: s_nop 0
4380 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v30
4381 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v31
4382 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
4383 ; GCN-HSA-NEXT: s_waitcnt vmcnt(14)
4384 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v32
4385 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v33
4386 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
4387 ; GCN-HSA-NEXT: s_nop 0
4388 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, v34
4389 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, v35
4390 ; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
4391 ; GCN-HSA-NEXT: s_endpgm
4392 %ld = load <32 x i32>, ptr addrspace(1) %in
4393 %ext = zext <32 x i32> %ld to <32 x i64>
4394 store <32 x i64> %ext, ptr addrspace(1) %out
; Kernel under test: copies one <32 x i32> vector from the global-memory
; (addrspace(1)) pointer %in to the global-memory pointer %out, with no
; extension or other arithmetic in between.
;
; The expectations below cover the SI-NOHSA, GCNX3-HSA, GCNX3-NOHSA, EG and
; GCN-HSA check prefixes. For every prefix the copy is expected to be split
; into eight four-dword (128-bit) loads and eight four-dword stores, at byte
; offsets 0 through 112 in steps of 16.
;
; These expectations are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
4398 define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
4399 ; SI-NOHSA-LABEL: global_load_v32i32:
4400 ; SI-NOHSA: ; %bb.0:
4401 ; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4402 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
4403 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1
4404 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6
4405 ; SI-NOHSA-NEXT: s_mov_b32 s11, s7
4406 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4407 ; SI-NOHSA-NEXT: s_mov_b32 s4, s0
4408 ; SI-NOHSA-NEXT: s_mov_b32 s5, s1
4409 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2
4410 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3
4411 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
4412 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
4413 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
4414 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
4415 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
4416 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
4417 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
4418 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
4419 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
4420 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
4421 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
4422 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
4423 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
4424 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
4425 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
4426 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
4427 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
4428 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
4429 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
4430 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
4431 ; SI-NOHSA-NEXT: s_endpgm
4433 ; GCNX3-HSA-LABEL: global_load_v32i32:
4434 ; GCNX3-HSA: ; %bb.0:
4435 ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4436 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
4437 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
4438 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4439 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48
4440 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
4441 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5
4442 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
4443 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
4444 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4
4445 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
4446 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
4447 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4448 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
4449 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
4450 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50
4451 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4452 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
4453 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
4454 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
4455 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4456 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5
4457 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4
4458 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70
4459 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6
4460 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
4461 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7
4462 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60
4463 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
4464 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
4465 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
4466 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5
4467 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
4468 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4
4469 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
4470 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
4471 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
4472 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
4473 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
4474 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
4475 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
4476 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1
4477 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4478 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0
4479 ; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70
4480 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0
4481 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4482 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
4483 ; GCNX3-HSA-NEXT: s_nop 0
4484 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
4485 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
4486 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
4487 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
4488 ; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50
4489 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0
4490 ; GCNX3-HSA-NEXT: s_add_u32 s8, s0, 32
4491 ; GCNX3-HSA-NEXT: s_addc_u32 s9, s1, 0
4492 ; GCNX3-HSA-NEXT: s_add_u32 s10, s0, 48
4493 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s1, 0
4494 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s10
4495 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s11
4496 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
4497 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
4498 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4499 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
4500 ; GCNX3-HSA-NEXT: s_nop 0
4501 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s8
4502 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s9
4503 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4504 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11]
4505 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6
4506 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4
4507 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
4508 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7
4509 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
4510 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5
4511 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
4512 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
4513 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4514 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
4515 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4516 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
4517 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4518 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23]
4519 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4520 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
4521 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
4522 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[28:31]
4523 ; GCNX3-HSA-NEXT: s_endpgm
4525 ; GCNX3-NOHSA-LABEL: global_load_v32i32:
4526 ; GCNX3-NOHSA: ; %bb.0:
4527 ; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4528 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
4529 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
4530 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
4531 ; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
4532 ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
4533 ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
4534 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
4535 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4536 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4537 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4538 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4539 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
4540 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
4541 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
4542 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
4543 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
4544 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
4545 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
4546 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
4547 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
4548 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
4549 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
4550 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
4551 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4552 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
4553 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4554 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
4555 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4556 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
4557 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
4558 ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
4559 ; GCNX3-NOHSA-NEXT: s_endpgm
4561 ; EG-LABEL: global_load_v32i32:
4563 ; EG-NEXT: ALU 23, @28, KC0[CB0:0-32], KC1[]
4564 ; EG-NEXT: TEX 7 @12
4565 ; EG-NEXT: ALU 1, @52, KC0[], KC1[]
4566 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
4567 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
4568 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
4569 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
4570 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0
4571 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0
4572 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0
4573 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
4575 ; EG-NEXT: Fetch clause starting at 12:
4576 ; EG-NEXT: VTX_READ_128 T8.XYZW, T7.X, 96, #1
4577 ; EG-NEXT: VTX_READ_128 T9.XYZW, T7.X, 112, #1
4578 ; EG-NEXT: VTX_READ_128 T10.XYZW, T7.X, 64, #1
4579 ; EG-NEXT: VTX_READ_128 T11.XYZW, T7.X, 80, #1
4580 ; EG-NEXT: VTX_READ_128 T12.XYZW, T7.X, 32, #1
4581 ; EG-NEXT: VTX_READ_128 T13.XYZW, T7.X, 48, #1
4582 ; EG-NEXT: VTX_READ_128 T14.XYZW, T7.X, 0, #1
4583 ; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 16, #1
4584 ; EG-NEXT: ALU clause starting at 28:
4585 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4586 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4587 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
4588 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4589 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4590 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4591 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4592 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4593 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4594 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
4595 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
4596 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4597 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
4598 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4599 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4600 ; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
4601 ; EG-NEXT: LSHR T5.X, PV.W, literal.x,
4602 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4603 ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
4604 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4605 ; EG-NEXT: MOV * T7.X, KC0[2].Z,
4606 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4607 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4608 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
4609 ; EG-NEXT: ALU clause starting at 52:
4610 ; EG-NEXT: LSHR * T15.X, T0.W, literal.x,
4611 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4613 ; GCN-HSA-LABEL: global_load_v32i32:
4615 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4616 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0
4617 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
4618 ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96
4619 ; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:112
4620 ; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:64
4621 ; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:80
4622 ; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:32
4623 ; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:48
4624 ; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3]
4625 ; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:16
4626 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4627 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
4628 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4629 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
4630 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4631 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
4632 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4633 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
4634 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4635 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
4636 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4637 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
4638 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4639 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
4640 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
4641 ; GCN-HSA-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16
4642 ; GCN-HSA-NEXT: s_endpgm
; IR under test: a single whole-vector load from %in followed by a single
; whole-vector store to %out.
4643 %ld = load <32 x i32>, ptr addrspace(1) %in
4644 store <32 x i32> %ld, ptr addrspace(1) %out
; Attribute group referenced as "#0" on the kernel define lines in this test
; (for example @global_load_v32i32); it marks those kernels nounwind.
4648 attributes #0 = { nounwind }