1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
5 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; Scalar i64 case. The kernel reads one i64 through the addrspace(4) pointer
; %in and writes it through the addrspace(1) pointer %out.
; The autogenerated assertions below form one group per check prefix used by
; the llc invocations at the top of this file. Each group expects a single
; 64-bit load (s_load_dwordx2, s_load_b64 or VTX_READ_64) followed by one store
; of the loaded value. Regenerate the assertions with
; utils/update_llc_test_checks.py instead of editing them by hand.
8 define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
9 ; GFX6-LABEL: constant_load_i64:
11 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
12 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
14 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
15 ; GFX6-NEXT: s_mov_b32 s2, -1
16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
18 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
19 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
22 ; GFX7-LABEL: constant_load_i64:
24 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
25 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
26 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
27 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
29 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
30 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
31 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
32 ; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
35 ; GFX8-LABEL: constant_load_i64:
37 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
38 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
40 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
41 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
42 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
44 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
45 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
48 ; EG-LABEL: constant_load_i64:
50 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
52 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
53 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
56 ; EG-NEXT: Fetch clause starting at 6:
57 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
58 ; EG-NEXT: ALU clause starting at 8:
59 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
60 ; EG-NEXT: ALU clause starting at 9:
61 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
62 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
64 ; GFX12-LABEL: constant_load_i64:
66 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
67 ; GFX12-NEXT: s_wait_kmcnt 0x0
68 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
69 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
70 ; GFX12-NEXT: s_wait_kmcnt 0x0
71 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
72 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
73 ; GFX12-NEXT: s_endpgm
74 %ld = load i64, ptr addrspace(4) %in
75 store i64 %ld, ptr addrspace(1) %out
; <2 x i64> case. The kernel reads one 128-bit vector through the addrspace(4)
; pointer %in and writes it through the addrspace(1) pointer %out.
; Every check group below expects a single 128-bit load (s_load_dwordx4,
; s_load_b128 or VTX_READ_128) and a single 128-bit store. The assertions are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
79 define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
80 ; GFX6-LABEL: constant_load_v2i64:
81 ; GFX6: ; %bb.0: ; %entry
82 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
83 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
84 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
85 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
86 ; GFX6-NEXT: s_mov_b32 s2, -1
87 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
88 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
89 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
90 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
91 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
92 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
95 ; GFX7-LABEL: constant_load_v2i64:
96 ; GFX7: ; %bb.0: ; %entry
97 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
98 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
99 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
100 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
101 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
102 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
103 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
104 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
105 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
106 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
107 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
108 ; GFX7-NEXT: s_endpgm
110 ; GFX8-LABEL: constant_load_v2i64:
111 ; GFX8: ; %bb.0: ; %entry
112 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
113 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
114 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
115 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
116 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
117 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
119 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
120 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
121 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
122 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
123 ; GFX8-NEXT: s_endpgm
125 ; EG-LABEL: constant_load_v2i64:
126 ; EG: ; %bb.0: ; %entry
127 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
129 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
130 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
133 ; EG-NEXT: Fetch clause starting at 6:
134 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
135 ; EG-NEXT: ALU clause starting at 8:
136 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
137 ; EG-NEXT: ALU clause starting at 9:
138 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
139 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
141 ; GFX12-LABEL: constant_load_v2i64:
142 ; GFX12: ; %bb.0: ; %entry
143 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
144 ; GFX12-NEXT: s_wait_kmcnt 0x0
145 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
146 ; GFX12-NEXT: v_mov_b32_e32 v4, 0
147 ; GFX12-NEXT: s_wait_kmcnt 0x0
148 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
149 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
150 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
151 ; GFX12-NEXT: s_endpgm
153 %ld = load <2 x i64>, ptr addrspace(4) %in
154 store <2 x i64> %ld, ptr addrspace(1) %out
; <3 x i64> case (a non-power-of-two vector). The kernel reads the vector
; through the addrspace(4) pointer %in and writes it through the addrspace(1)
; pointer %out.
; The amdgcn check groups below expect the access to be split in two pieces,
; a 128-bit load at offset 0 plus a 64-bit load of the third element, matched
; by a 64-bit store at byte offset 16 and a 128-bit store at the base address.
; The immediate on the 64-bit load is 0x4 in the first two groups and 0x10 in
; the gfx8 and gfx12 groups (presumably dword-scaled versus byte-scaled
; offsets; confirm against the ISA documentation).
; The EG group expects two VTX_READ_128 fetches (offsets 0 and 16) with a
; four-channel store and a two-channel store.
; The assertions are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
158 define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
159 ; GFX6-LABEL: constant_load_v3i64:
160 ; GFX6: ; %bb.0: ; %entry
161 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
162 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
164 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
165 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
166 ; GFX6-NEXT: s_mov_b32 s2, -1
167 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
168 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
169 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
170 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
171 ; GFX6-NEXT: s_waitcnt expcnt(0)
172 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
173 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
174 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
175 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
177 ; GFX6-NEXT: s_endpgm
179 ; GFX7-LABEL: constant_load_v3i64:
180 ; GFX7: ; %bb.0: ; %entry
181 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
182 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
184 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
185 ; GFX7-NEXT: s_add_u32 s2, s0, 16
186 ; GFX7-NEXT: s_addc_u32 s3, s1, 0
187 ; GFX7-NEXT: v_mov_b32_e32 v4, s3
188 ; GFX7-NEXT: v_mov_b32_e32 v3, s2
189 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX7-NEXT: v_mov_b32_e32 v5, s8
191 ; GFX7-NEXT: v_mov_b32_e32 v6, s9
192 ; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
193 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
194 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
195 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
196 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
197 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
198 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
199 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
200 ; GFX7-NEXT: s_endpgm
202 ; GFX8-LABEL: constant_load_v3i64:
203 ; GFX8: ; %bb.0: ; %entry
204 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
205 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10
207 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
208 ; GFX8-NEXT: s_add_u32 s2, s0, 16
209 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
210 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
211 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
212 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX8-NEXT: v_mov_b32_e32 v5, s8
214 ; GFX8-NEXT: v_mov_b32_e32 v6, s9
215 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
216 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
217 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
218 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
219 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
220 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
221 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
222 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
223 ; GFX8-NEXT: s_endpgm
225 ; EG-LABEL: constant_load_v3i64:
226 ; EG: ; %bb.0: ; %entry
227 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
229 ; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[]
230 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
232 ; EG-NEXT: ALU 3, @15, KC0[CB0:0-32], KC1[]
233 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
235 ; EG-NEXT: Fetch clause starting at 8:
236 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
237 ; EG-NEXT: Fetch clause starting at 10:
238 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
239 ; EG-NEXT: ALU clause starting at 12:
240 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
241 ; EG-NEXT: ALU clause starting at 13:
242 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
243 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
244 ; EG-NEXT: ALU clause starting at 15:
245 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
246 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
247 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
248 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
250 ; GFX12-LABEL: constant_load_v3i64:
251 ; GFX12: ; %bb.0: ; %entry
252 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
253 ; GFX12-NEXT: s_wait_kmcnt 0x0
254 ; GFX12-NEXT: s_clause 0x1
255 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
256 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
257 ; GFX12-NEXT: s_wait_kmcnt 0x0
258 ; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
259 ; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
260 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
261 ; GFX12-NEXT: v_mov_b32_e32 v2, s6
262 ; GFX12-NEXT: s_clause 0x1
263 ; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
264 ; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
265 ; GFX12-NEXT: s_endpgm
267 %ld = load <3 x i64>, ptr addrspace(4) %in
268 store <3 x i64> %ld, ptr addrspace(1) %out
; <4 x i64> case. The kernel reads one 256-bit vector through the addrspace(4)
; pointer %in and writes it through the addrspace(1) pointer %out.
; The amdgcn check groups below expect a single 256-bit scalar load
; (s_load_dwordx8 or s_load_b256) and two 128-bit stores, the upper half at
; byte offset 16 first and then the lower half at the base address. In the
; flat-store groups the +16 address is formed with s_add_u32 / s_addc_u32.
; The EG group expects two VTX_READ_128 fetches (offsets 16 and 0) and two
; four-channel stores.
; The assertions are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
272 define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
273 ; GFX6-LABEL: constant_load_v4i64:
274 ; GFX6: ; %bb.0: ; %entry
275 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
276 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
277 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
278 ; GFX6-NEXT: s_mov_b32 s11, 0xf000
279 ; GFX6-NEXT: s_mov_b32 s10, -1
280 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
282 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
283 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
284 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
285 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
286 ; GFX6-NEXT: s_waitcnt expcnt(0)
287 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
288 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
289 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
290 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
291 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
292 ; GFX6-NEXT: s_endpgm
294 ; GFX7-LABEL: constant_load_v4i64:
295 ; GFX7: ; %bb.0: ; %entry
296 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
297 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
298 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
299 ; GFX7-NEXT: s_add_u32 s10, s8, 16
300 ; GFX7-NEXT: s_addc_u32 s11, s9, 0
301 ; GFX7-NEXT: v_mov_b32_e32 v6, s10
302 ; GFX7-NEXT: v_mov_b32_e32 v7, s11
303 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
304 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
305 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
306 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
307 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
308 ; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
309 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
310 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
311 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
312 ; GFX7-NEXT: v_mov_b32_e32 v6, s2
313 ; GFX7-NEXT: v_mov_b32_e32 v7, s3
314 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
315 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
316 ; GFX7-NEXT: s_endpgm
318 ; GFX8-LABEL: constant_load_v4i64:
319 ; GFX8: ; %bb.0: ; %entry
320 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
321 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
323 ; GFX8-NEXT: s_add_u32 s10, s8, 16
324 ; GFX8-NEXT: s_addc_u32 s11, s9, 0
325 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
326 ; GFX8-NEXT: v_mov_b32_e32 v7, s11
327 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
329 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
330 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
331 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
332 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
333 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
334 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
335 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
336 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
337 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
338 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
339 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
340 ; GFX8-NEXT: s_endpgm
342 ; EG-LABEL: constant_load_v4i64:
343 ; EG: ; %bb.0: ; %entry
344 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
346 ; EG-NEXT: ALU 3, @13, KC0[CB0:0-32], KC1[]
347 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
349 ; EG-NEXT: ALU 1, @17, KC0[CB0:0-32], KC1[]
350 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
352 ; EG-NEXT: Fetch clause starting at 8:
353 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
354 ; EG-NEXT: Fetch clause starting at 10:
355 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
356 ; EG-NEXT: ALU clause starting at 12:
357 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
358 ; EG-NEXT: ALU clause starting at 13:
359 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
360 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
361 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
362 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
363 ; EG-NEXT: ALU clause starting at 17:
364 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
365 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
367 ; GFX12-LABEL: constant_load_v4i64:
368 ; GFX12: ; %bb.0: ; %entry
369 ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
370 ; GFX12-NEXT: s_wait_kmcnt 0x0
371 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
372 ; GFX12-NEXT: s_wait_kmcnt 0x0
373 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
374 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
375 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
376 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
377 ; GFX12-NEXT: v_mov_b32_e32 v6, s2
378 ; GFX12-NEXT: s_clause 0x1
379 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
380 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
381 ; GFX12-NEXT: s_endpgm
383 %ld = load <4 x i64>, ptr addrspace(4) %in
384 store <4 x i64> %ld, ptr addrspace(1) %out
; <8 x i64> case. The kernel reads one 512-bit vector through the addrspace(4)
; pointer %in and writes it through the addrspace(1) pointer %out.
; The amdgcn check groups below expect a single 512-bit scalar load
; (s_load_dwordx16 or s_load_b512) and four 128-bit stores issued from the
; highest byte offset down (48, 32, 16, then the base address). In the
; flat-store groups each offset address is formed with s_add_u32 / s_addc_u32.
; The EG group expects four VTX_READ_128 fetches (offsets 48, 32, 16, 0) and
; four four-channel stores.
; The assertions are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
388 define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
389 ; GFX6-LABEL: constant_load_v8i64:
390 ; GFX6: ; %bb.0: ; %entry
391 ; GFX6-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
392 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
393 ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
394 ; GFX6-NEXT: s_mov_b32 s19, 0xf000
395 ; GFX6-NEXT: s_mov_b32 s18, -1
396 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX6-NEXT: v_mov_b32_e32 v0, s12
398 ; GFX6-NEXT: v_mov_b32_e32 v1, s13
399 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
400 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
401 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
402 ; GFX6-NEXT: s_waitcnt expcnt(0)
403 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
404 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
405 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
406 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
407 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
408 ; GFX6-NEXT: s_waitcnt expcnt(0)
409 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
410 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
411 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
412 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
413 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
414 ; GFX6-NEXT: s_waitcnt expcnt(0)
415 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
416 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
417 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
418 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
419 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
420 ; GFX6-NEXT: s_endpgm
422 ; GFX7-LABEL: constant_load_v8i64:
423 ; GFX7: ; %bb.0: ; %entry
424 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
425 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
427 ; GFX7-NEXT: s_add_u32 s18, s16, 48
428 ; GFX7-NEXT: s_addc_u32 s19, s17, 0
429 ; GFX7-NEXT: v_mov_b32_e32 v6, s18
430 ; GFX7-NEXT: v_mov_b32_e32 v7, s19
431 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
432 ; GFX7-NEXT: v_mov_b32_e32 v0, s12
433 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
434 ; GFX7-NEXT: v_mov_b32_e32 v2, s14
435 ; GFX7-NEXT: v_mov_b32_e32 v3, s15
436 ; GFX7-NEXT: v_mov_b32_e32 v4, s8
437 ; GFX7-NEXT: s_add_u32 s8, s16, 32
438 ; GFX7-NEXT: v_mov_b32_e32 v5, s9
439 ; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
440 ; GFX7-NEXT: s_addc_u32 s9, s17, 0
441 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
442 ; GFX7-NEXT: v_mov_b32_e32 v6, s10
443 ; GFX7-NEXT: v_mov_b32_e32 v7, s11
444 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
445 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
446 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
447 ; GFX7-NEXT: s_add_u32 s4, s16, 16
448 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
449 ; GFX7-NEXT: s_addc_u32 s5, s17, 0
450 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
451 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
452 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
453 ; GFX7-NEXT: v_mov_b32_e32 v5, s5
454 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
455 ; GFX7-NEXT: v_mov_b32_e32 v4, s16
456 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
457 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
458 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
459 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
460 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
461 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
462 ; GFX7-NEXT: s_endpgm
464 ; GFX8-LABEL: constant_load_v8i64:
465 ; GFX8: ; %bb.0: ; %entry
466 ; GFX8-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
467 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
469 ; GFX8-NEXT: s_add_u32 s18, s16, 48
470 ; GFX8-NEXT: s_addc_u32 s19, s17, 0
471 ; GFX8-NEXT: v_mov_b32_e32 v6, s18
472 ; GFX8-NEXT: v_mov_b32_e32 v7, s19
473 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
474 ; GFX8-NEXT: v_mov_b32_e32 v0, s12
475 ; GFX8-NEXT: v_mov_b32_e32 v1, s13
476 ; GFX8-NEXT: v_mov_b32_e32 v2, s14
477 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
478 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
479 ; GFX8-NEXT: s_add_u32 s8, s16, 32
480 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
481 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
482 ; GFX8-NEXT: s_addc_u32 s9, s17, 0
483 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
484 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
485 ; GFX8-NEXT: v_mov_b32_e32 v7, s11
486 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
487 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
488 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
489 ; GFX8-NEXT: s_add_u32 s4, s16, 16
490 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
491 ; GFX8-NEXT: s_addc_u32 s5, s17, 0
492 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
493 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
494 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
495 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
496 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
497 ; GFX8-NEXT: v_mov_b32_e32 v4, s16
498 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
499 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
500 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
501 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
502 ; GFX8-NEXT: v_mov_b32_e32 v5, s17
503 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
504 ; GFX8-NEXT: s_endpgm
506 ; EG-LABEL: constant_load_v8i64:
507 ; EG: ; %bb.0: ; %entry
508 ; EG-NEXT: ALU 0, @22, KC0[CB0:0-32], KC1[]
510 ; EG-NEXT: ALU 3, @23, KC0[CB0:0-32], KC1[]
511 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
513 ; EG-NEXT: ALU 3, @27, KC0[CB0:0-32], KC1[]
514 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
516 ; EG-NEXT: ALU 3, @31, KC0[CB0:0-32], KC1[]
517 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
519 ; EG-NEXT: ALU 1, @35, KC0[CB0:0-32], KC1[]
520 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
522 ; EG-NEXT: Fetch clause starting at 14:
523 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
524 ; EG-NEXT: Fetch clause starting at 16:
525 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
526 ; EG-NEXT: Fetch clause starting at 18:
527 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
528 ; EG-NEXT: Fetch clause starting at 20:
529 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
530 ; EG-NEXT: ALU clause starting at 22:
531 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
532 ; EG-NEXT: ALU clause starting at 23:
533 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
534 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
535 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
536 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
537 ; EG-NEXT: ALU clause starting at 27:
538 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
539 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
540 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
541 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
542 ; EG-NEXT: ALU clause starting at 31:
543 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
544 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
545 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
546 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
547 ; EG-NEXT: ALU clause starting at 35:
548 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
549 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
551 ; GFX12-LABEL: constant_load_v8i64:
552 ; GFX12: ; %bb.0: ; %entry
553 ; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
554 ; GFX12-NEXT: s_wait_kmcnt 0x0
555 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
556 ; GFX12-NEXT: s_wait_kmcnt 0x0
557 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
558 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
559 ; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
560 ; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
561 ; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5
562 ; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7
563 ; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1
564 ; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3
565 ; GFX12-NEXT: v_mov_b32_e32 v14, s2
566 ; GFX12-NEXT: s_clause 0x3
567 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
568 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
569 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
570 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
571 ; GFX12-NEXT: s_endpgm
573 %ld = load <8 x i64>, ptr addrspace(4) %in
574 store <8 x i64> %ld, ptr addrspace(1) %out
578 define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
579 ; GFX6-LABEL: constant_load_v16i64:
580 ; GFX6: ; %bb.0: ; %entry
581 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
582 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
583 ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10
584 ; GFX6-NEXT: s_mov_b32 s39, 0xf000
585 ; GFX6-NEXT: s_mov_b32 s38, -1
586 ; GFX6-NEXT: s_mov_b32 s36, s0
587 ; GFX6-NEXT: s_mov_b32 s37, s1
588 ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
589 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
590 ; GFX6-NEXT: v_mov_b32_e32 v0, s28
591 ; GFX6-NEXT: v_mov_b32_e32 v1, s29
592 ; GFX6-NEXT: v_mov_b32_e32 v2, s30
593 ; GFX6-NEXT: v_mov_b32_e32 v3, s31
594 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
595 ; GFX6-NEXT: s_waitcnt expcnt(0)
596 ; GFX6-NEXT: v_mov_b32_e32 v0, s24
597 ; GFX6-NEXT: v_mov_b32_e32 v1, s25
598 ; GFX6-NEXT: v_mov_b32_e32 v2, s26
599 ; GFX6-NEXT: v_mov_b32_e32 v3, s27
600 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
601 ; GFX6-NEXT: s_waitcnt expcnt(0)
602 ; GFX6-NEXT: v_mov_b32_e32 v0, s20
603 ; GFX6-NEXT: v_mov_b32_e32 v1, s21
604 ; GFX6-NEXT: v_mov_b32_e32 v2, s22
605 ; GFX6-NEXT: v_mov_b32_e32 v3, s23
606 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
607 ; GFX6-NEXT: s_waitcnt expcnt(0)
608 ; GFX6-NEXT: v_mov_b32_e32 v0, s16
609 ; GFX6-NEXT: v_mov_b32_e32 v1, s17
610 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
611 ; GFX6-NEXT: v_mov_b32_e32 v3, s19
612 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
613 ; GFX6-NEXT: s_waitcnt expcnt(0)
614 ; GFX6-NEXT: v_mov_b32_e32 v0, s12
615 ; GFX6-NEXT: v_mov_b32_e32 v1, s13
616 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
617 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
618 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
619 ; GFX6-NEXT: s_waitcnt expcnt(0)
620 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
621 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
622 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
623 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
624 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
625 ; GFX6-NEXT: s_waitcnt expcnt(0)
626 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
627 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
628 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
629 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
630 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
631 ; GFX6-NEXT: s_waitcnt expcnt(0)
632 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
633 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
634 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
635 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
636 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0
637 ; GFX6-NEXT: s_endpgm
639 ; GFX7-LABEL: constant_load_v16i64:
640 ; GFX7: ; %bb.0: ; %entry
641 ; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
642 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
643 ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
644 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
645 ; GFX7-NEXT: s_add_u32 s34, s36, 0x70
646 ; GFX7-NEXT: s_addc_u32 s35, s37, 0
647 ; GFX7-NEXT: v_mov_b32_e32 v5, s34
648 ; GFX7-NEXT: v_mov_b32_e32 v6, s35
649 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX7-NEXT: v_mov_b32_e32 v0, s28
651 ; GFX7-NEXT: v_mov_b32_e32 v1, s29
652 ; GFX7-NEXT: v_mov_b32_e32 v2, s30
653 ; GFX7-NEXT: v_mov_b32_e32 v3, s31
654 ; GFX7-NEXT: v_mov_b32_e32 v4, s24
655 ; GFX7-NEXT: s_add_u32 s24, s36, 0x60
656 ; GFX7-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
657 ; GFX7-NEXT: v_mov_b32_e32 v5, s25
658 ; GFX7-NEXT: s_addc_u32 s25, s37, 0
659 ; GFX7-NEXT: v_mov_b32_e32 v0, s24
660 ; GFX7-NEXT: v_mov_b32_e32 v6, s26
661 ; GFX7-NEXT: v_mov_b32_e32 v7, s27
662 ; GFX7-NEXT: v_mov_b32_e32 v1, s25
663 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
664 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
665 ; GFX7-NEXT: s_add_u32 s20, s36, 0x50
666 ; GFX7-NEXT: v_mov_b32_e32 v1, s21
667 ; GFX7-NEXT: s_addc_u32 s21, s37, 0
668 ; GFX7-NEXT: v_mov_b32_e32 v4, s20
669 ; GFX7-NEXT: v_mov_b32_e32 v2, s22
670 ; GFX7-NEXT: v_mov_b32_e32 v3, s23
671 ; GFX7-NEXT: v_mov_b32_e32 v5, s21
672 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
674 ; GFX7-NEXT: v_mov_b32_e32 v0, s16
675 ; GFX7-NEXT: s_add_u32 s16, s36, 64
676 ; GFX7-NEXT: v_mov_b32_e32 v1, s17
677 ; GFX7-NEXT: s_addc_u32 s17, s37, 0
678 ; GFX7-NEXT: v_mov_b32_e32 v4, s16
679 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
680 ; GFX7-NEXT: v_mov_b32_e32 v3, s19
681 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
682 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
684 ; GFX7-NEXT: v_mov_b32_e32 v0, s12
685 ; GFX7-NEXT: s_add_u32 s12, s36, 48
686 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
687 ; GFX7-NEXT: s_addc_u32 s13, s37, 0
688 ; GFX7-NEXT: v_mov_b32_e32 v4, s12
689 ; GFX7-NEXT: v_mov_b32_e32 v2, s14
690 ; GFX7-NEXT: v_mov_b32_e32 v3, s15
691 ; GFX7-NEXT: v_mov_b32_e32 v5, s13
692 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
694 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
695 ; GFX7-NEXT: s_add_u32 s8, s36, 32
696 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
697 ; GFX7-NEXT: s_addc_u32 s9, s37, 0
698 ; GFX7-NEXT: v_mov_b32_e32 v4, s8
699 ; GFX7-NEXT: v_mov_b32_e32 v2, s10
700 ; GFX7-NEXT: v_mov_b32_e32 v3, s11
701 ; GFX7-NEXT: v_mov_b32_e32 v5, s9
702 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
704 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
705 ; GFX7-NEXT: s_add_u32 s4, s36, 16
706 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
707 ; GFX7-NEXT: s_addc_u32 s5, s37, 0
708 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
709 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
710 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
711 ; GFX7-NEXT: v_mov_b32_e32 v5, s5
712 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
713 ; GFX7-NEXT: v_mov_b32_e32 v4, s36
714 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
715 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
716 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
717 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
718 ; GFX7-NEXT: v_mov_b32_e32 v5, s37
719 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
720 ; GFX7-NEXT: s_endpgm
722 ; GFX8-LABEL: constant_load_v16i64:
723 ; GFX8: ; %bb.0: ; %entry
724 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
725 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
726 ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40
727 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
728 ; GFX8-NEXT: s_add_u32 s34, s36, 0x70
729 ; GFX8-NEXT: s_addc_u32 s35, s37, 0
730 ; GFX8-NEXT: v_mov_b32_e32 v5, s34
731 ; GFX8-NEXT: v_mov_b32_e32 v6, s35
732 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
733 ; GFX8-NEXT: v_mov_b32_e32 v0, s28
734 ; GFX8-NEXT: v_mov_b32_e32 v1, s29
735 ; GFX8-NEXT: v_mov_b32_e32 v2, s30
736 ; GFX8-NEXT: v_mov_b32_e32 v3, s31
737 ; GFX8-NEXT: v_mov_b32_e32 v4, s24
738 ; GFX8-NEXT: s_add_u32 s24, s36, 0x60
739 ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
740 ; GFX8-NEXT: v_mov_b32_e32 v5, s25
741 ; GFX8-NEXT: s_addc_u32 s25, s37, 0
742 ; GFX8-NEXT: v_mov_b32_e32 v0, s24
743 ; GFX8-NEXT: v_mov_b32_e32 v6, s26
744 ; GFX8-NEXT: v_mov_b32_e32 v7, s27
745 ; GFX8-NEXT: v_mov_b32_e32 v1, s25
746 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
747 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
748 ; GFX8-NEXT: s_add_u32 s20, s36, 0x50
749 ; GFX8-NEXT: v_mov_b32_e32 v1, s21
750 ; GFX8-NEXT: s_addc_u32 s21, s37, 0
751 ; GFX8-NEXT: v_mov_b32_e32 v4, s20
752 ; GFX8-NEXT: v_mov_b32_e32 v2, s22
753 ; GFX8-NEXT: v_mov_b32_e32 v3, s23
754 ; GFX8-NEXT: v_mov_b32_e32 v5, s21
755 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
757 ; GFX8-NEXT: v_mov_b32_e32 v0, s16
758 ; GFX8-NEXT: s_add_u32 s16, s36, 64
759 ; GFX8-NEXT: v_mov_b32_e32 v1, s17
760 ; GFX8-NEXT: s_addc_u32 s17, s37, 0
761 ; GFX8-NEXT: v_mov_b32_e32 v4, s16
762 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
763 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
764 ; GFX8-NEXT: v_mov_b32_e32 v5, s17
765 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
767 ; GFX8-NEXT: v_mov_b32_e32 v0, s12
768 ; GFX8-NEXT: s_add_u32 s12, s36, 48
769 ; GFX8-NEXT: v_mov_b32_e32 v1, s13
770 ; GFX8-NEXT: s_addc_u32 s13, s37, 0
771 ; GFX8-NEXT: v_mov_b32_e32 v4, s12
772 ; GFX8-NEXT: v_mov_b32_e32 v2, s14
773 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
774 ; GFX8-NEXT: v_mov_b32_e32 v5, s13
775 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
777 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
778 ; GFX8-NEXT: s_add_u32 s8, s36, 32
779 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
780 ; GFX8-NEXT: s_addc_u32 s9, s37, 0
781 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
782 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
783 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
784 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
785 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
787 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
788 ; GFX8-NEXT: s_add_u32 s4, s36, 16
789 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
790 ; GFX8-NEXT: s_addc_u32 s5, s37, 0
791 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
792 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
793 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
794 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
795 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
796 ; GFX8-NEXT: v_mov_b32_e32 v4, s36
797 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
798 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
799 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
800 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
801 ; GFX8-NEXT: v_mov_b32_e32 v5, s37
802 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
803 ; GFX8-NEXT: s_endpgm
805 ; EG-LABEL: constant_load_v16i64:
806 ; EG: ; %bb.0: ; %entry
807 ; EG-NEXT: ALU 0, @42, KC0[CB0:0-32], KC1[]
809 ; EG-NEXT: ALU 3, @43, KC0[CB0:0-32], KC1[]
810 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
812 ; EG-NEXT: ALU 3, @47, KC0[CB0:0-32], KC1[]
813 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
815 ; EG-NEXT: ALU 3, @51, KC0[CB0:0-32], KC1[]
816 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
818 ; EG-NEXT: ALU 3, @55, KC0[CB0:0-32], KC1[]
819 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
821 ; EG-NEXT: ALU 3, @59, KC0[CB0:0-32], KC1[]
822 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
824 ; EG-NEXT: ALU 3, @63, KC0[CB0:0-32], KC1[]
825 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
827 ; EG-NEXT: ALU 3, @67, KC0[CB0:0-32], KC1[]
828 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
830 ; EG-NEXT: ALU 1, @71, KC0[CB0:0-32], KC1[]
831 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
833 ; EG-NEXT: Fetch clause starting at 26:
834 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 112, #1
835 ; EG-NEXT: Fetch clause starting at 28:
836 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 96, #1
837 ; EG-NEXT: Fetch clause starting at 30:
838 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 80, #1
839 ; EG-NEXT: Fetch clause starting at 32:
840 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 64, #1
841 ; EG-NEXT: Fetch clause starting at 34:
842 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
843 ; EG-NEXT: Fetch clause starting at 36:
844 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
845 ; EG-NEXT: Fetch clause starting at 38:
846 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
847 ; EG-NEXT: Fetch clause starting at 40:
848 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
849 ; EG-NEXT: ALU clause starting at 42:
850 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
851 ; EG-NEXT: ALU clause starting at 43:
852 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
853 ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
854 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
855 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
856 ; EG-NEXT: ALU clause starting at 47:
857 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
858 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
859 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
860 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
861 ; EG-NEXT: ALU clause starting at 51:
862 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
863 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
864 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
865 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
866 ; EG-NEXT: ALU clause starting at 55:
867 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
868 ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
869 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
870 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
871 ; EG-NEXT: ALU clause starting at 59:
872 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
873 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
874 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
875 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
876 ; EG-NEXT: ALU clause starting at 63:
877 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
878 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
879 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
880 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
881 ; EG-NEXT: ALU clause starting at 67:
882 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
883 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
884 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
885 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
886 ; EG-NEXT: ALU clause starting at 71:
887 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
888 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
890 ; GFX12-LABEL: constant_load_v16i64:
891 ; GFX12: ; %bb.0: ; %entry
892 ; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
893 ; GFX12-NEXT: s_wait_kmcnt 0x0
894 ; GFX12-NEXT: s_clause 0x1
895 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
896 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
897 ; GFX12-NEXT: s_wait_kmcnt 0x0
898 ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
899 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
900 ; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
901 ; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27
902 ; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21
903 ; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
904 ; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17
905 ; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19
906 ; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13
907 ; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15
908 ; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9
909 ; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11
910 ; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5
911 ; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7
912 ; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1
913 ; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3
914 ; GFX12-NEXT: v_mov_b32_e32 v30, s2
915 ; GFX12-NEXT: s_clause 0x7
916 ; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
917 ; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
918 ; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
919 ; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
920 ; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
921 ; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
922 ; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
923 ; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
924 ; GFX12-NEXT: s_endpgm
926 %ld = load <16 x i64>, ptr addrspace(4) %in
927 store <16 x i64> %ld, ptr addrspace(1) %out
; Attribute group #0: applied to kernel definitions in this file through the
; trailing `#0` on their `define` line (e.g. @constant_load_i64). It carries
; only `nounwind`, i.e. the annotated functions are declared not to unwind.
931 attributes #0 = { nounwind }