1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
5 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
; Test kernel: loads one scalar i64 through the addrspace(4) pointer %in and
; stores it through the addrspace(1) pointer %out.
; The prefixed comment lines inside the function are autogenerated expectations
; (see the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Attribute group #0 is defined outside the visible part of the file.
8 define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
9 ; GFX6-LABEL: constant_load_i64:
11 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
12 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
14 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
15 ; GFX6-NEXT: s_mov_b32 s2, -1
16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
18 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
19 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
22 ; GFX7-LABEL: constant_load_i64:
24 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
25 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
26 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
27 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
28 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
29 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
30 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
31 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
32 ; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
35 ; GFX8-LABEL: constant_load_i64:
37 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
38 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
40 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
41 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
42 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
44 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
45 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
48 ; EG-LABEL: constant_load_i64:
50 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
52 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
53 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
56 ; EG-NEXT: Fetch clause starting at 6:
57 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
58 ; EG-NEXT: ALU clause starting at 8:
59 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
60 ; EG-NEXT: ALU clause starting at 9:
61 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
62 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
64 ; GFX12-LABEL: constant_load_i64:
66 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
67 ; GFX12-NEXT: s_wait_kmcnt 0x0
68 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
69 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
70 ; GFX12-NEXT: s_wait_kmcnt 0x0
71 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
72 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
74 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
75 ; GFX12-NEXT: s_endpgm
; IR under test: one i64 load from %in, then a store of the loaded value to %out.
76 %ld = load i64, ptr addrspace(4) %in
77 store i64 %ld, ptr addrspace(1) %out
; Test kernel: loads a <2 x i64> vector through the addrspace(4) pointer %in
; and stores it through the addrspace(1) pointer %out.
; The prefixed comment lines inside the function are autogenerated expectations
; (see the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Attribute group #0 is defined outside the visible part of the file.
81 define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
82 ; GFX6-LABEL: constant_load_v2i64:
83 ; GFX6: ; %bb.0: ; %entry
84 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
85 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
86 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
87 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
88 ; GFX6-NEXT: s_mov_b32 s2, -1
89 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
90 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
91 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
92 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
93 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
94 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
97 ; GFX7-LABEL: constant_load_v2i64:
98 ; GFX7: ; %bb.0: ; %entry
99 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
100 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
101 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
102 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
103 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
104 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
105 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
106 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
107 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
108 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
109 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
110 ; GFX7-NEXT: s_endpgm
112 ; GFX8-LABEL: constant_load_v2i64:
113 ; GFX8: ; %bb.0: ; %entry
114 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
115 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
116 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
117 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
118 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
119 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
121 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
122 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
123 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
124 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
125 ; GFX8-NEXT: s_endpgm
127 ; EG-LABEL: constant_load_v2i64:
128 ; EG: ; %bb.0: ; %entry
129 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
131 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
132 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
135 ; EG-NEXT: Fetch clause starting at 6:
136 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
137 ; EG-NEXT: ALU clause starting at 8:
138 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
139 ; EG-NEXT: ALU clause starting at 9:
140 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
141 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
143 ; GFX12-LABEL: constant_load_v2i64:
144 ; GFX12: ; %bb.0: ; %entry
145 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
146 ; GFX12-NEXT: s_wait_kmcnt 0x0
147 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
148 ; GFX12-NEXT: v_mov_b32_e32 v4, 0
149 ; GFX12-NEXT: s_wait_kmcnt 0x0
150 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
151 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
152 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
153 ; GFX12-NEXT: s_nop 0
154 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
155 ; GFX12-NEXT: s_endpgm
; IR under test: one <2 x i64> load from %in, then a store of the loaded value to %out.
157 %ld = load <2 x i64>, ptr addrspace(4) %in
158 store <2 x i64> %ld, ptr addrspace(1) %out
; Test kernel: loads a <3 x i64> vector through the addrspace(4) pointer %in
; and stores it through the addrspace(1) pointer %out.
; In the expectations below, the GFX-prefixed outputs handle the 24-byte value
; as a 16-byte piece plus an 8-byte piece (two loads and two stores each).
; The prefixed comment lines inside the function are autogenerated expectations
; (see the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Attribute group #0 is defined outside the visible part of the file.
162 define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
163 ; GFX6-LABEL: constant_load_v3i64:
164 ; GFX6: ; %bb.0: ; %entry
165 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
166 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
168 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
169 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
170 ; GFX6-NEXT: s_mov_b32 s2, -1
171 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
172 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
173 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
174 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
175 ; GFX6-NEXT: s_waitcnt expcnt(0)
176 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
177 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
178 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
179 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
180 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
181 ; GFX6-NEXT: s_endpgm
183 ; GFX7-LABEL: constant_load_v3i64:
184 ; GFX7: ; %bb.0: ; %entry
185 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
186 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
187 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
188 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
189 ; GFX7-NEXT: s_add_u32 s2, s0, 16
190 ; GFX7-NEXT: s_addc_u32 s3, s1, 0
191 ; GFX7-NEXT: v_mov_b32_e32 v4, s3
192 ; GFX7-NEXT: v_mov_b32_e32 v3, s2
193 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX7-NEXT: v_mov_b32_e32 v5, s8
195 ; GFX7-NEXT: v_mov_b32_e32 v6, s9
196 ; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
197 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
198 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
199 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
200 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
201 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
202 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
203 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
204 ; GFX7-NEXT: s_endpgm
206 ; GFX8-LABEL: constant_load_v3i64:
207 ; GFX8: ; %bb.0: ; %entry
208 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
209 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10
211 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
212 ; GFX8-NEXT: s_add_u32 s2, s0, 16
213 ; GFX8-NEXT: s_addc_u32 s3, s1, 0
214 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
215 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
216 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX8-NEXT: v_mov_b32_e32 v5, s8
218 ; GFX8-NEXT: v_mov_b32_e32 v6, s9
219 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
220 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
221 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
222 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
223 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
224 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
225 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
226 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
227 ; GFX8-NEXT: s_endpgm
229 ; EG-LABEL: constant_load_v3i64:
230 ; EG: ; %bb.0: ; %entry
231 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
233 ; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[]
234 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
236 ; EG-NEXT: ALU 3, @15, KC0[CB0:0-32], KC1[]
237 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
239 ; EG-NEXT: Fetch clause starting at 8:
240 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
241 ; EG-NEXT: Fetch clause starting at 10:
242 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
243 ; EG-NEXT: ALU clause starting at 12:
244 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
245 ; EG-NEXT: ALU clause starting at 13:
246 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
247 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
248 ; EG-NEXT: ALU clause starting at 15:
249 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
250 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
251 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
252 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
254 ; GFX12-LABEL: constant_load_v3i64:
255 ; GFX12: ; %bb.0: ; %entry
256 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
257 ; GFX12-NEXT: s_wait_kmcnt 0x0
258 ; GFX12-NEXT: s_clause 0x1
259 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
260 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
261 ; GFX12-NEXT: s_wait_kmcnt 0x0
262 ; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
263 ; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
264 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
265 ; GFX12-NEXT: v_mov_b32_e32 v2, s6
266 ; GFX12-NEXT: s_clause 0x1
267 ; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
268 ; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
269 ; GFX12-NEXT: s_nop 0
270 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
271 ; GFX12-NEXT: s_endpgm
; IR under test: one <3 x i64> load from %in, then a store of the loaded value to %out.
273 %ld = load <3 x i64>, ptr addrspace(4) %in
274 store <3 x i64> %ld, ptr addrspace(1) %out
; Test kernel: loads a <4 x i64> vector through the addrspace(4) pointer %in
; and stores it through the addrspace(1) pointer %out.
; In the expectations below, the GFX-prefixed outputs fetch the value with one
; 8-dword (256-bit) scalar load and write it back as two 16-byte stores.
; The prefixed comment lines inside the function are autogenerated expectations
; (see the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Attribute group #0 is defined outside the visible part of the file.
278 define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
279 ; GFX6-LABEL: constant_load_v4i64:
280 ; GFX6: ; %bb.0: ; %entry
281 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
282 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
283 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
284 ; GFX6-NEXT: s_mov_b32 s11, 0xf000
285 ; GFX6-NEXT: s_mov_b32 s10, -1
286 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
288 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
289 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
290 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
291 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
292 ; GFX6-NEXT: s_waitcnt expcnt(0)
293 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
294 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
295 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
296 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
297 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
298 ; GFX6-NEXT: s_endpgm
300 ; GFX7-LABEL: constant_load_v4i64:
301 ; GFX7: ; %bb.0: ; %entry
302 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
303 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
304 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
305 ; GFX7-NEXT: s_add_u32 s10, s8, 16
306 ; GFX7-NEXT: s_addc_u32 s11, s9, 0
307 ; GFX7-NEXT: v_mov_b32_e32 v6, s10
308 ; GFX7-NEXT: v_mov_b32_e32 v7, s11
309 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
310 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
311 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
312 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
313 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
314 ; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
315 ; GFX7-NEXT: v_mov_b32_e32 v4, s0
316 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
317 ; GFX7-NEXT: v_mov_b32_e32 v5, s1
318 ; GFX7-NEXT: v_mov_b32_e32 v6, s2
319 ; GFX7-NEXT: v_mov_b32_e32 v7, s3
320 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
321 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
322 ; GFX7-NEXT: s_endpgm
324 ; GFX8-LABEL: constant_load_v4i64:
325 ; GFX8: ; %bb.0: ; %entry
326 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
327 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
329 ; GFX8-NEXT: s_add_u32 s10, s8, 16
330 ; GFX8-NEXT: s_addc_u32 s11, s9, 0
331 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
332 ; GFX8-NEXT: v_mov_b32_e32 v7, s11
333 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
334 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
335 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
336 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
337 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
338 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
339 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
340 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
341 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
342 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
343 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
344 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
345 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
346 ; GFX8-NEXT: s_endpgm
348 ; EG-LABEL: constant_load_v4i64:
349 ; EG: ; %bb.0: ; %entry
350 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
352 ; EG-NEXT: ALU 3, @13, KC0[CB0:0-32], KC1[]
353 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
355 ; EG-NEXT: ALU 1, @17, KC0[CB0:0-32], KC1[]
356 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
358 ; EG-NEXT: Fetch clause starting at 8:
359 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
360 ; EG-NEXT: Fetch clause starting at 10:
361 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
362 ; EG-NEXT: ALU clause starting at 12:
363 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
364 ; EG-NEXT: ALU clause starting at 13:
365 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
366 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
367 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
368 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
369 ; EG-NEXT: ALU clause starting at 17:
370 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
371 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
373 ; GFX12-LABEL: constant_load_v4i64:
374 ; GFX12: ; %bb.0: ; %entry
375 ; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24
376 ; GFX12-NEXT: s_wait_kmcnt 0x0
377 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
378 ; GFX12-NEXT: s_wait_kmcnt 0x0
379 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
380 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
381 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
382 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
383 ; GFX12-NEXT: v_mov_b32_e32 v6, s2
384 ; GFX12-NEXT: s_clause 0x1
385 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
386 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
387 ; GFX12-NEXT: s_nop 0
388 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
389 ; GFX12-NEXT: s_endpgm
; IR under test: one <4 x i64> load from %in, then a store of the loaded value to %out.
391 %ld = load <4 x i64>, ptr addrspace(4) %in
392 store <4 x i64> %ld, ptr addrspace(1) %out
; Test kernel: loads an <8 x i64> vector through the addrspace(4) pointer %in
; and stores it through the addrspace(1) pointer %out.
; In the expectations below, the GFX-prefixed outputs fetch the value with one
; 16-dword (512-bit) scalar load and write it back as four 16-byte stores at
; byte offsets 48, 32, 16 and 0.
; The prefixed comment lines inside the function are autogenerated expectations
; (see the note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Attribute group #0 is defined outside the visible part of the file.
396 define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
397 ; GFX6-LABEL: constant_load_v8i64:
398 ; GFX6: ; %bb.0: ; %entry
399 ; GFX6-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9
400 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
402 ; GFX6-NEXT: s_mov_b32 s19, 0xf000
403 ; GFX6-NEXT: s_mov_b32 s18, -1
404 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
405 ; GFX6-NEXT: v_mov_b32_e32 v0, s12
406 ; GFX6-NEXT: v_mov_b32_e32 v1, s13
407 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
408 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
409 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
410 ; GFX6-NEXT: s_waitcnt expcnt(0)
411 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
412 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
413 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
414 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
415 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
416 ; GFX6-NEXT: s_waitcnt expcnt(0)
417 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
418 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
419 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
420 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
421 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
422 ; GFX6-NEXT: s_waitcnt expcnt(0)
423 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
424 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
425 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
426 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
427 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
428 ; GFX6-NEXT: s_endpgm
430 ; GFX7-LABEL: constant_load_v8i64:
431 ; GFX7: ; %bb.0: ; %entry
432 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0
433 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
435 ; GFX7-NEXT: s_add_u32 s18, s16, 48
436 ; GFX7-NEXT: s_addc_u32 s19, s17, 0
437 ; GFX7-NEXT: v_mov_b32_e32 v6, s18
438 ; GFX7-NEXT: v_mov_b32_e32 v7, s19
439 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX7-NEXT: v_mov_b32_e32 v0, s12
441 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
442 ; GFX7-NEXT: v_mov_b32_e32 v2, s14
443 ; GFX7-NEXT: v_mov_b32_e32 v3, s15
444 ; GFX7-NEXT: v_mov_b32_e32 v4, s8
445 ; GFX7-NEXT: s_add_u32 s8, s16, 32
446 ; GFX7-NEXT: v_mov_b32_e32 v5, s9
447 ; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
448 ; GFX7-NEXT: s_addc_u32 s9, s17, 0
449 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
450 ; GFX7-NEXT: v_mov_b32_e32 v6, s10
451 ; GFX7-NEXT: v_mov_b32_e32 v7, s11
452 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
453 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
454 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
455 ; GFX7-NEXT: s_add_u32 s4, s16, 16
456 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
457 ; GFX7-NEXT: s_addc_u32 s5, s17, 0
458 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
459 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
460 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
461 ; GFX7-NEXT: v_mov_b32_e32 v5, s5
462 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
463 ; GFX7-NEXT: v_mov_b32_e32 v4, s16
464 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
465 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
466 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
467 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
468 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
469 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
470 ; GFX7-NEXT: s_endpgm
472 ; GFX8-LABEL: constant_load_v8i64:
473 ; GFX8: ; %bb.0: ; %entry
474 ; GFX8-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
475 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
476 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
477 ; GFX8-NEXT: s_add_u32 s18, s16, 48
478 ; GFX8-NEXT: s_addc_u32 s19, s17, 0
479 ; GFX8-NEXT: v_mov_b32_e32 v6, s18
480 ; GFX8-NEXT: v_mov_b32_e32 v7, s19
481 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX8-NEXT: v_mov_b32_e32 v0, s12
483 ; GFX8-NEXT: v_mov_b32_e32 v1, s13
484 ; GFX8-NEXT: v_mov_b32_e32 v2, s14
485 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
486 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
487 ; GFX8-NEXT: s_add_u32 s8, s16, 32
488 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
489 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
490 ; GFX8-NEXT: s_addc_u32 s9, s17, 0
491 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
492 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
493 ; GFX8-NEXT: v_mov_b32_e32 v7, s11
494 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
495 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
496 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
497 ; GFX8-NEXT: s_add_u32 s4, s16, 16
498 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
499 ; GFX8-NEXT: s_addc_u32 s5, s17, 0
500 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
501 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
502 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
503 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
504 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
505 ; GFX8-NEXT: v_mov_b32_e32 v4, s16
506 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
507 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
508 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
509 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
510 ; GFX8-NEXT: v_mov_b32_e32 v5, s17
511 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
512 ; GFX8-NEXT: s_endpgm
514 ; EG-LABEL: constant_load_v8i64:
515 ; EG: ; %bb.0: ; %entry
516 ; EG-NEXT: ALU 0, @22, KC0[CB0:0-32], KC1[]
518 ; EG-NEXT: ALU 3, @23, KC0[CB0:0-32], KC1[]
519 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
521 ; EG-NEXT: ALU 3, @27, KC0[CB0:0-32], KC1[]
522 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
524 ; EG-NEXT: ALU 3, @31, KC0[CB0:0-32], KC1[]
525 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
527 ; EG-NEXT: ALU 1, @35, KC0[CB0:0-32], KC1[]
528 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
530 ; EG-NEXT: Fetch clause starting at 14:
531 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
532 ; EG-NEXT: Fetch clause starting at 16:
533 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
534 ; EG-NEXT: Fetch clause starting at 18:
535 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
536 ; EG-NEXT: Fetch clause starting at 20:
537 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
538 ; EG-NEXT: ALU clause starting at 22:
539 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
540 ; EG-NEXT: ALU clause starting at 23:
541 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
542 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
543 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
544 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
545 ; EG-NEXT: ALU clause starting at 27:
546 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
547 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
548 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
549 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
550 ; EG-NEXT: ALU clause starting at 31:
551 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
552 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
553 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
554 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
555 ; EG-NEXT: ALU clause starting at 35:
556 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
557 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
559 ; GFX12-LABEL: constant_load_v8i64:
560 ; GFX12: ; %bb.0: ; %entry
561 ; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24
562 ; GFX12-NEXT: s_wait_kmcnt 0x0
563 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
564 ; GFX12-NEXT: s_wait_kmcnt 0x0
565 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
566 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
567 ; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
568 ; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
569 ; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5
570 ; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7
571 ; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1
572 ; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3
573 ; GFX12-NEXT: v_mov_b32_e32 v14, s2
574 ; GFX12-NEXT: s_clause 0x3
575 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
576 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
577 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
578 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
579 ; GFX12-NEXT: s_nop 0
580 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
581 ; GFX12-NEXT: s_endpgm
; IR under test: one <8 x i64> load from %in, then a store of the loaded value to %out.
583 %ld = load <8 x i64>, ptr addrspace(4) %in
584 store <8 x i64> %ld, ptr addrspace(1) %out
588 define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
589 ; GFX6-LABEL: constant_load_v16i64:
590 ; GFX6: ; %bb.0: ; %entry
591 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
592 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
593 ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10
594 ; GFX6-NEXT: s_mov_b32 s39, 0xf000
595 ; GFX6-NEXT: s_mov_b32 s38, -1
596 ; GFX6-NEXT: s_mov_b32 s36, s0
597 ; GFX6-NEXT: s_mov_b32 s37, s1
598 ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
599 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
600 ; GFX6-NEXT: v_mov_b32_e32 v0, s28
601 ; GFX6-NEXT: v_mov_b32_e32 v1, s29
602 ; GFX6-NEXT: v_mov_b32_e32 v2, s30
603 ; GFX6-NEXT: v_mov_b32_e32 v3, s31
604 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
605 ; GFX6-NEXT: s_waitcnt expcnt(0)
606 ; GFX6-NEXT: v_mov_b32_e32 v0, s24
607 ; GFX6-NEXT: v_mov_b32_e32 v1, s25
608 ; GFX6-NEXT: v_mov_b32_e32 v2, s26
609 ; GFX6-NEXT: v_mov_b32_e32 v3, s27
610 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
611 ; GFX6-NEXT: s_waitcnt expcnt(0)
612 ; GFX6-NEXT: v_mov_b32_e32 v0, s20
613 ; GFX6-NEXT: v_mov_b32_e32 v1, s21
614 ; GFX6-NEXT: v_mov_b32_e32 v2, s22
615 ; GFX6-NEXT: v_mov_b32_e32 v3, s23
616 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
617 ; GFX6-NEXT: s_waitcnt expcnt(0)
618 ; GFX6-NEXT: v_mov_b32_e32 v0, s16
619 ; GFX6-NEXT: v_mov_b32_e32 v1, s17
620 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
621 ; GFX6-NEXT: v_mov_b32_e32 v3, s19
622 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
623 ; GFX6-NEXT: s_waitcnt expcnt(0)
624 ; GFX6-NEXT: v_mov_b32_e32 v0, s12
625 ; GFX6-NEXT: v_mov_b32_e32 v1, s13
626 ; GFX6-NEXT: v_mov_b32_e32 v2, s14
627 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
628 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
629 ; GFX6-NEXT: s_waitcnt expcnt(0)
630 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
631 ; GFX6-NEXT: v_mov_b32_e32 v1, s9
632 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
633 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
634 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
635 ; GFX6-NEXT: s_waitcnt expcnt(0)
636 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
637 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
638 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
639 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
640 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
641 ; GFX6-NEXT: s_waitcnt expcnt(0)
642 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
643 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
644 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
645 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
646 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0
647 ; GFX6-NEXT: s_endpgm
649 ; GFX7-LABEL: constant_load_v16i64:
650 ; GFX7: ; %bb.0: ; %entry
651 ; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0
652 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
654 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
655 ; GFX7-NEXT: s_add_u32 s34, s36, 0x70
656 ; GFX7-NEXT: s_addc_u32 s35, s37, 0
657 ; GFX7-NEXT: v_mov_b32_e32 v5, s34
658 ; GFX7-NEXT: v_mov_b32_e32 v6, s35
659 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX7-NEXT: v_mov_b32_e32 v0, s28
661 ; GFX7-NEXT: v_mov_b32_e32 v1, s29
662 ; GFX7-NEXT: v_mov_b32_e32 v2, s30
663 ; GFX7-NEXT: v_mov_b32_e32 v3, s31
664 ; GFX7-NEXT: v_mov_b32_e32 v4, s24
665 ; GFX7-NEXT: s_add_u32 s24, s36, 0x60
666 ; GFX7-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
667 ; GFX7-NEXT: v_mov_b32_e32 v5, s25
668 ; GFX7-NEXT: s_addc_u32 s25, s37, 0
669 ; GFX7-NEXT: v_mov_b32_e32 v0, s24
670 ; GFX7-NEXT: v_mov_b32_e32 v6, s26
671 ; GFX7-NEXT: v_mov_b32_e32 v7, s27
672 ; GFX7-NEXT: v_mov_b32_e32 v1, s25
673 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
674 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
675 ; GFX7-NEXT: s_add_u32 s20, s36, 0x50
676 ; GFX7-NEXT: v_mov_b32_e32 v1, s21
677 ; GFX7-NEXT: s_addc_u32 s21, s37, 0
678 ; GFX7-NEXT: v_mov_b32_e32 v4, s20
679 ; GFX7-NEXT: v_mov_b32_e32 v2, s22
680 ; GFX7-NEXT: v_mov_b32_e32 v3, s23
681 ; GFX7-NEXT: v_mov_b32_e32 v5, s21
682 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
684 ; GFX7-NEXT: v_mov_b32_e32 v0, s16
685 ; GFX7-NEXT: s_add_u32 s16, s36, 64
686 ; GFX7-NEXT: v_mov_b32_e32 v1, s17
687 ; GFX7-NEXT: s_addc_u32 s17, s37, 0
688 ; GFX7-NEXT: v_mov_b32_e32 v4, s16
689 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
690 ; GFX7-NEXT: v_mov_b32_e32 v3, s19
691 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
692 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
694 ; GFX7-NEXT: v_mov_b32_e32 v0, s12
695 ; GFX7-NEXT: s_add_u32 s12, s36, 48
696 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
697 ; GFX7-NEXT: s_addc_u32 s13, s37, 0
698 ; GFX7-NEXT: v_mov_b32_e32 v4, s12
699 ; GFX7-NEXT: v_mov_b32_e32 v2, s14
700 ; GFX7-NEXT: v_mov_b32_e32 v3, s15
701 ; GFX7-NEXT: v_mov_b32_e32 v5, s13
702 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
704 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
705 ; GFX7-NEXT: s_add_u32 s8, s36, 32
706 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
707 ; GFX7-NEXT: s_addc_u32 s9, s37, 0
708 ; GFX7-NEXT: v_mov_b32_e32 v4, s8
709 ; GFX7-NEXT: v_mov_b32_e32 v2, s10
710 ; GFX7-NEXT: v_mov_b32_e32 v3, s11
711 ; GFX7-NEXT: v_mov_b32_e32 v5, s9
712 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
714 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
715 ; GFX7-NEXT: s_add_u32 s4, s36, 16
716 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
717 ; GFX7-NEXT: s_addc_u32 s5, s37, 0
718 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
719 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
720 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
721 ; GFX7-NEXT: v_mov_b32_e32 v5, s5
722 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
723 ; GFX7-NEXT: v_mov_b32_e32 v4, s36
724 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
725 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
726 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
727 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
728 ; GFX7-NEXT: v_mov_b32_e32 v5, s37
729 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
730 ; GFX7-NEXT: s_endpgm
732 ; GFX8-LABEL: constant_load_v16i64:
733 ; GFX8: ; %bb.0: ; %entry
734 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24
735 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
736 ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40
737 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
738 ; GFX8-NEXT: s_add_u32 s34, s36, 0x70
739 ; GFX8-NEXT: s_addc_u32 s35, s37, 0
740 ; GFX8-NEXT: v_mov_b32_e32 v5, s34
741 ; GFX8-NEXT: v_mov_b32_e32 v6, s35
742 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
743 ; GFX8-NEXT: v_mov_b32_e32 v0, s28
744 ; GFX8-NEXT: v_mov_b32_e32 v1, s29
745 ; GFX8-NEXT: v_mov_b32_e32 v2, s30
746 ; GFX8-NEXT: v_mov_b32_e32 v3, s31
747 ; GFX8-NEXT: v_mov_b32_e32 v4, s24
748 ; GFX8-NEXT: s_add_u32 s24, s36, 0x60
749 ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
750 ; GFX8-NEXT: v_mov_b32_e32 v5, s25
751 ; GFX8-NEXT: s_addc_u32 s25, s37, 0
752 ; GFX8-NEXT: v_mov_b32_e32 v0, s24
753 ; GFX8-NEXT: v_mov_b32_e32 v6, s26
754 ; GFX8-NEXT: v_mov_b32_e32 v7, s27
755 ; GFX8-NEXT: v_mov_b32_e32 v1, s25
756 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
757 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
758 ; GFX8-NEXT: s_add_u32 s20, s36, 0x50
759 ; GFX8-NEXT: v_mov_b32_e32 v1, s21
760 ; GFX8-NEXT: s_addc_u32 s21, s37, 0
761 ; GFX8-NEXT: v_mov_b32_e32 v4, s20
762 ; GFX8-NEXT: v_mov_b32_e32 v2, s22
763 ; GFX8-NEXT: v_mov_b32_e32 v3, s23
764 ; GFX8-NEXT: v_mov_b32_e32 v5, s21
765 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
767 ; GFX8-NEXT: v_mov_b32_e32 v0, s16
768 ; GFX8-NEXT: s_add_u32 s16, s36, 64
769 ; GFX8-NEXT: v_mov_b32_e32 v1, s17
770 ; GFX8-NEXT: s_addc_u32 s17, s37, 0
771 ; GFX8-NEXT: v_mov_b32_e32 v4, s16
772 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
773 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
774 ; GFX8-NEXT: v_mov_b32_e32 v5, s17
775 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
777 ; GFX8-NEXT: v_mov_b32_e32 v0, s12
778 ; GFX8-NEXT: s_add_u32 s12, s36, 48
779 ; GFX8-NEXT: v_mov_b32_e32 v1, s13
780 ; GFX8-NEXT: s_addc_u32 s13, s37, 0
781 ; GFX8-NEXT: v_mov_b32_e32 v4, s12
782 ; GFX8-NEXT: v_mov_b32_e32 v2, s14
783 ; GFX8-NEXT: v_mov_b32_e32 v3, s15
784 ; GFX8-NEXT: v_mov_b32_e32 v5, s13
785 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
787 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
788 ; GFX8-NEXT: s_add_u32 s8, s36, 32
789 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
790 ; GFX8-NEXT: s_addc_u32 s9, s37, 0
791 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
792 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
793 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
794 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
795 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
797 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
798 ; GFX8-NEXT: s_add_u32 s4, s36, 16
799 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
800 ; GFX8-NEXT: s_addc_u32 s5, s37, 0
801 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
802 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
803 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
804 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
805 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
806 ; GFX8-NEXT: v_mov_b32_e32 v4, s36
807 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
808 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
809 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
810 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
811 ; GFX8-NEXT: v_mov_b32_e32 v5, s37
812 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
813 ; GFX8-NEXT: s_endpgm
815 ; EG-LABEL: constant_load_v16i64:
816 ; EG: ; %bb.0: ; %entry
817 ; EG-NEXT: ALU 0, @42, KC0[CB0:0-32], KC1[]
819 ; EG-NEXT: ALU 3, @43, KC0[CB0:0-32], KC1[]
820 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
822 ; EG-NEXT: ALU 3, @47, KC0[CB0:0-32], KC1[]
823 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
825 ; EG-NEXT: ALU 3, @51, KC0[CB0:0-32], KC1[]
826 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
828 ; EG-NEXT: ALU 3, @55, KC0[CB0:0-32], KC1[]
829 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
831 ; EG-NEXT: ALU 3, @59, KC0[CB0:0-32], KC1[]
832 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
834 ; EG-NEXT: ALU 3, @63, KC0[CB0:0-32], KC1[]
835 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
837 ; EG-NEXT: ALU 3, @67, KC0[CB0:0-32], KC1[]
838 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
840 ; EG-NEXT: ALU 1, @71, KC0[CB0:0-32], KC1[]
841 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
843 ; EG-NEXT: Fetch clause starting at 26:
844 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 112, #1
845 ; EG-NEXT: Fetch clause starting at 28:
846 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 96, #1
847 ; EG-NEXT: Fetch clause starting at 30:
848 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 80, #1
849 ; EG-NEXT: Fetch clause starting at 32:
850 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 64, #1
851 ; EG-NEXT: Fetch clause starting at 34:
852 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
853 ; EG-NEXT: Fetch clause starting at 36:
854 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
855 ; EG-NEXT: Fetch clause starting at 38:
856 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
857 ; EG-NEXT: Fetch clause starting at 40:
858 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
859 ; EG-NEXT: ALU clause starting at 42:
860 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
861 ; EG-NEXT: ALU clause starting at 43:
862 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
863 ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
864 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
865 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
866 ; EG-NEXT: ALU clause starting at 47:
867 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
868 ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
869 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
870 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
871 ; EG-NEXT: ALU clause starting at 51:
872 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
873 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
874 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
875 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
876 ; EG-NEXT: ALU clause starting at 55:
877 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
878 ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
879 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
880 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
881 ; EG-NEXT: ALU clause starting at 59:
882 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
883 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
884 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
885 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
886 ; EG-NEXT: ALU clause starting at 63:
887 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
888 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
889 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
890 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
891 ; EG-NEXT: ALU clause starting at 67:
892 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
893 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
894 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
895 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
896 ; EG-NEXT: ALU clause starting at 71:
897 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
898 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
900 ; GFX12-LABEL: constant_load_v16i64:
901 ; GFX12: ; %bb.0: ; %entry
902 ; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24
903 ; GFX12-NEXT: s_wait_kmcnt 0x0
904 ; GFX12-NEXT: s_clause 0x1
905 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
906 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
907 ; GFX12-NEXT: s_wait_kmcnt 0x0
908 ; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
909 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
910 ; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
911 ; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27
912 ; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21
913 ; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
914 ; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17
915 ; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19
916 ; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13
917 ; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15
918 ; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9
919 ; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11
920 ; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5
921 ; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7
922 ; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1
923 ; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3
924 ; GFX12-NEXT: v_mov_b32_e32 v30, s2
925 ; GFX12-NEXT: s_clause 0x7
926 ; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
927 ; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
928 ; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
929 ; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
930 ; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
931 ; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
932 ; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
933 ; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
934 ; GFX12-NEXT: s_nop 0
935 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
936 ; GFX12-NEXT: s_endpgm
938 %ld = load <16 x i64>, ptr addrspace(4) %in
939 store <16 x i64> %ld, ptr addrspace(1) %out
943 attributes #0 = { nounwind } ; Attribute group referenced as `#0` by the test kernels in this file (e.g. @constant_load_i64); marks them nounwind.