1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
13 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
14 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Kernel under test: load an i32 from the addrspace(5) pointer %in with
; !nontemporal metadata attached, then store the value through the
; addrspace(1) pointer %out. The load address is the kernel argument itself
; (no per-lane offset is added).
;
; The autogenerated assertions below pin the cache-policy operands selected
; for the nontemporal load on each run line: "glc slc" on the gfx600, gfx700
; and gfx90a runs, "slc" on gfx1010, "nt" on gfx940, "slc dlc" on gfx1100 and
; "th:TH_LOAD_NT" on gfx1200.
;
; Every per-prefix sequence is checked through the terminating s_endpgm, so an
; unexpected trailing instruction after the final store is caught on all
; targets, including the gfx600 and gfx700 runs.
; NOTE(review): the !0 metadata node is defined outside this excerpt.
16 define amdgpu_kernel void @private_nontemporal_load_0(
17 ; GFX6-LABEL: private_nontemporal_load_0:
18 ; GFX6: ; %bb.0: ; %entry
19 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
20 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
21 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
22 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
23 ; GFX6-NEXT: s_add_u32 s8, s8, s7
24 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
25 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
26 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
27 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
28 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
29 ; GFX6-NEXT: s_mov_b32 s2, -1
30 ; GFX6-NEXT: s_waitcnt vmcnt(0)
31 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
32 ; GFX6-NEXT: s_endpgm
34 ; GFX7-LABEL: private_nontemporal_load_0:
35 ; GFX7: ; %bb.0: ; %entry
36 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
37 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
38 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
39 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
40 ; GFX7-NEXT: s_add_u32 s8, s8, s7
41 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
42 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
44 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
45 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
46 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
47 ; GFX7-NEXT: s_waitcnt vmcnt(0)
48 ; GFX7-NEXT: flat_store_dword v[0:1], v2
49 ; GFX7-NEXT: s_endpgm
51 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
52 ; GFX10-WGP: ; %bb.0: ; %entry
53 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
54 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
55 ; GFX10-WGP-NEXT: s_clause 0x1
56 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
57 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
58 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
59 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
60 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
61 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
62 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
63 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
64 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
65 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
66 ; GFX10-WGP-NEXT: s_endpgm
68 ; GFX10-CU-LABEL: private_nontemporal_load_0:
69 ; GFX10-CU: ; %bb.0: ; %entry
70 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
71 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
72 ; GFX10-CU-NEXT: s_clause 0x1
73 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
74 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
75 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
76 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
77 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
78 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
79 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
80 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
81 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
82 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
83 ; GFX10-CU-NEXT: s_endpgm
85 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
86 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
87 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
88 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
89 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
90 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
91 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
92 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
93 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
94 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
95 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
96 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
97 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
98 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
99 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
100 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
101 ; SKIP-CACHE-INV-NEXT: s_endpgm
103 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
104 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
105 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
106 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
107 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
108 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
109 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
110 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
111 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
112 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
114 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
115 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
116 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
117 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
119 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
120 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
121 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
122 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
123 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
124 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
125 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
126 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
127 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
128 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
130 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
131 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
132 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
133 ; GFX90A-TGSPLIT-NEXT: s_endpgm
135 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
136 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
137 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
138 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
139 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
140 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
142 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
143 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
144 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
146 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
147 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
148 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
149 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
150 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
151 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
153 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
154 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
155 ; GFX940-TGSPLIT-NEXT: s_endpgm
157 ; GFX11-WGP-LABEL: private_nontemporal_load_0:
158 ; GFX11-WGP: ; %bb.0: ; %entry
159 ; GFX11-WGP-NEXT: s_clause 0x1
160 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
161 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
162 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
163 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
164 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
165 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
166 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
167 ; GFX11-WGP-NEXT: s_nop 0
168 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
169 ; GFX11-WGP-NEXT: s_endpgm
171 ; GFX11-CU-LABEL: private_nontemporal_load_0:
172 ; GFX11-CU: ; %bb.0: ; %entry
173 ; GFX11-CU-NEXT: s_clause 0x1
174 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
175 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
176 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
177 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
179 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
180 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
181 ; GFX11-CU-NEXT: s_nop 0
182 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
183 ; GFX11-CU-NEXT: s_endpgm
185 ; GFX12-WGP-LABEL: private_nontemporal_load_0:
186 ; GFX12-WGP: ; %bb.0: ; %entry
187 ; GFX12-WGP-NEXT: s_clause 0x1
188 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
189 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
190 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, 0
191 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
192 ; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
193 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
194 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
195 ; GFX12-WGP-NEXT: s_nop 0
196 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
197 ; GFX12-WGP-NEXT: s_endpgm
199 ; GFX12-CU-LABEL: private_nontemporal_load_0:
200 ; GFX12-CU: ; %bb.0: ; %entry
201 ; GFX12-CU-NEXT: s_clause 0x1
202 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
203 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
204 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 0
205 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
206 ; GFX12-CU-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
207 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
208 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1]
209 ; GFX12-CU-NEXT: s_nop 0
210 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
211 ; GFX12-CU-NEXT: s_endpgm
212 ptr addrspace(5) %in, ptr addrspace(1) %out) {
; Only the load carries the !nontemporal hint; the store to %out is a plain
; store.
214 %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
215 store i32 %val, ptr addrspace(1) %out
; Kernel under test: nontemporal i32 load from addrspace(5) at a per-lane
; address (%in indexed by the x workitem id), followed by a plain store of the
; loaded value through the addrspace(1) pointer %out.
;
; The autogenerated assertions below pin two things per run line:
;  * the address computation: the workitem id is shifted left by 2 and the
;    base is added (v_lshlrev_b32 + v_add_i32, or one v_lshl_add_u32); on the
;    gfx1200 runs only the shift is emitted and the base is passed as the SGPR
;    operand of scratch_load_b32;
;  * the cache-policy operands on the load: "glc slc" on the gfx600, gfx700
;    and gfx90a runs, "slc" on gfx1010, "nt" on gfx940, "slc dlc" on gfx1100
;    and "th:TH_LOAD_NT" on gfx1200.
; NOTE(review): the !0 metadata node and the declaration of
; @llvm.amdgcn.workitem.id.x are outside this excerpt.
219 define amdgpu_kernel void @private_nontemporal_load_1(
220 ; GFX6-LABEL: private_nontemporal_load_1:
221 ; GFX6: ; %bb.0: ; %entry
222 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
223 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
224 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
225 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
226 ; GFX6-NEXT: s_add_u32 s8, s8, s7
227 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
228 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
229 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
230 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
231 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
232 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
233 ; GFX6-NEXT: s_mov_b32 s2, -1
234 ; GFX6-NEXT: s_waitcnt vmcnt(0)
235 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
236 ; GFX6-NEXT: s_endpgm
238 ; GFX7-LABEL: private_nontemporal_load_1:
239 ; GFX7: ; %bb.0: ; %entry
240 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
241 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
242 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
243 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
244 ; GFX7-NEXT: s_add_u32 s8, s8, s7
245 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
246 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
247 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
249 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
250 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
251 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
252 ; GFX7-NEXT: s_waitcnt vmcnt(0)
253 ; GFX7-NEXT: flat_store_dword v[0:1], v2
254 ; GFX7-NEXT: s_endpgm
256 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
257 ; GFX10-WGP: ; %bb.0: ; %entry
258 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
259 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
260 ; GFX10-WGP-NEXT: s_clause 0x1
261 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
262 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
263 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
264 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
265 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
266 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
268 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
269 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
270 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
271 ; GFX10-WGP-NEXT: s_endpgm
273 ; GFX10-CU-LABEL: private_nontemporal_load_1:
274 ; GFX10-CU: ; %bb.0: ; %entry
275 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
276 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
277 ; GFX10-CU-NEXT: s_clause 0x1
278 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
279 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
280 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
281 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
282 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
283 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
285 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
286 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
288 ; GFX10-CU-NEXT: s_endpgm
290 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
291 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
292 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
293 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
294 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
295 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
296 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
297 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
298 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
299 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
300 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
301 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
302 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
303 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
304 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
305 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
306 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
307 ; SKIP-CACHE-INV-NEXT: s_endpgm
309 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
310 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
311 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
312 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
313 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
314 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
315 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
316 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
317 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
318 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
319 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
320 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
321 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
322 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
323 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
325 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
326 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
327 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
328 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
329 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
330 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
331 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
332 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
333 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
334 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
335 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
336 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
337 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
338 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
339 ; GFX90A-TGSPLIT-NEXT: s_endpgm
341 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
342 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
343 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
344 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
345 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
346 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
348 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
349 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
350 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
351 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
353 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
354 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
355 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
356 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
357 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
358 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
359 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
360 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
361 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
362 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
363 ; GFX940-TGSPLIT-NEXT: s_endpgm
365 ; GFX11-WGP-LABEL: private_nontemporal_load_1:
366 ; GFX11-WGP: ; %bb.0: ; %entry
367 ; GFX11-WGP-NEXT: s_clause 0x1
368 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
369 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
370 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
371 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
373 ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc
374 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
375 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
376 ; GFX11-WGP-NEXT: s_nop 0
377 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
378 ; GFX11-WGP-NEXT: s_endpgm
380 ; GFX11-CU-LABEL: private_nontemporal_load_1:
381 ; GFX11-CU: ; %bb.0: ; %entry
382 ; GFX11-CU-NEXT: s_clause 0x1
383 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
384 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
385 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
386 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
387 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
388 ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc
389 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
390 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
391 ; GFX11-CU-NEXT: s_nop 0
392 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
393 ; GFX11-CU-NEXT: s_endpgm
395 ; GFX12-WGP-LABEL: private_nontemporal_load_1:
396 ; GFX12-WGP: ; %bb.0: ; %entry
397 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
398 ; GFX12-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
399 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
400 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
401 ; GFX12-WGP-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT
402 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
403 ; GFX12-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
404 ; GFX12-WGP-NEXT: s_nop 0
405 ; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
406 ; GFX12-WGP-NEXT: s_endpgm
408 ; GFX12-CU-LABEL: private_nontemporal_load_1:
409 ; GFX12-CU: ; %bb.0: ; %entry
410 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
411 ; GFX12-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
412 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
413 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
414 ; GFX12-CU-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT
415 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
416 ; GFX12-CU-NEXT: global_store_b32 v1, v0, s[0:1]
417 ; GFX12-CU-NEXT: s_nop 0
418 ; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
419 ; GFX12-CU-NEXT: s_endpgm
420 ptr addrspace(5) %in, ptr addrspace(1) %out) {
; Index %in by the x workitem id so each lane loads its own i32 element; only
; the load carries the !nontemporal hint.
422 %tid = call i32 @llvm.amdgcn.workitem.id.x()
423 %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
424 %val = load i32, ptr addrspace(5) %val.gep, align 4, !nontemporal !0
425 store i32 %val, ptr addrspace(1) %out
; Kernel under test: plain i32 load from the addrspace(1) pointer %in, then a
; store of that value through the addrspace(5) pointer %out with !nontemporal
; metadata attached. The store address is the kernel argument itself (no
; per-lane offset is added).
;
; The autogenerated assertions below pin the cache-policy operands selected
; for the nontemporal store on each run line: "glc slc" on the
; buffer_store_dword forms (gfx600, gfx700, gfx1010 and gfx90a runs),
; "sc0 nt sc1" on gfx940, "glc slc dlc" on gfx1100 and "th:TH_STORE_NT" on
; gfx1200. In every sequence the load from %in is expected as a scalar
; s_load whose result is copied into a VGPR before the store.
; NOTE(review): the !0 metadata node is defined outside this excerpt.
429 define amdgpu_kernel void @private_nontemporal_store_0(
430 ; GFX6-LABEL: private_nontemporal_store_0:
431 ; GFX6: ; %bb.0: ; %entry
432 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
433 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
434 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
435 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
436 ; GFX6-NEXT: s_add_u32 s8, s8, s7
437 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
438 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
439 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
440 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
441 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
443 ; GFX6-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
444 ; GFX6-NEXT: s_endpgm
446 ; GFX7-LABEL: private_nontemporal_store_0:
447 ; GFX7: ; %bb.0: ; %entry
448 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
449 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
450 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
451 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
452 ; GFX7-NEXT: s_add_u32 s8, s8, s7
453 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
454 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
455 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
456 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
457 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
459 ; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
460 ; GFX7-NEXT: s_endpgm
462 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
463 ; GFX10-WGP: ; %bb.0: ; %entry
464 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
465 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
466 ; GFX10-WGP-NEXT: s_clause 0x1
467 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
468 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
469 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
470 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
471 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
473 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
474 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
476 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
477 ; GFX10-WGP-NEXT: s_endpgm
479 ; GFX10-CU-LABEL: private_nontemporal_store_0:
480 ; GFX10-CU: ; %bb.0: ; %entry
481 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
482 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
483 ; GFX10-CU-NEXT: s_clause 0x1
484 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
485 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
486 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
487 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
488 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
490 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
491 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
493 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
494 ; GFX10-CU-NEXT: s_endpgm
496 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
497 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
498 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
499 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
500 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
501 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
502 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
503 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
504 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
505 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
506 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
507 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
508 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
509 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
510 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
511 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
512 ; SKIP-CACHE-INV-NEXT: s_endpgm
514 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
515 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
516 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
517 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
518 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
519 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
520 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
521 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
522 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
523 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
524 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
525 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
526 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
527 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
528 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
530 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
531 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
532 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
533 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
534 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
535 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
536 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
537 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
538 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
539 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
540 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
541 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
542 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
543 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
544 ; GFX90A-TGSPLIT-NEXT: s_endpgm
546 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
547 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
548 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
549 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
550 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
551 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
552 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
553 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
554 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 sc0 nt sc1
555 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
557 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
558 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
559 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
560 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
561 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
563 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
564 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
565 ; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 sc0 nt sc1
566 ; GFX940-TGSPLIT-NEXT: s_endpgm
568 ; GFX11-WGP-LABEL: private_nontemporal_store_0:
569 ; GFX11-WGP: ; %bb.0: ; %entry
570 ; GFX11-WGP-NEXT: s_clause 0x1
571 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
572 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
573 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
574 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
575 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
576 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
577 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
578 ; GFX11-WGP-NEXT: s_endpgm
580 ; GFX11-CU-LABEL: private_nontemporal_store_0:
581 ; GFX11-CU: ; %bb.0: ; %entry
582 ; GFX11-CU-NEXT: s_clause 0x1
583 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
584 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
585 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
587 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
588 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
589 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
590 ; GFX11-CU-NEXT: s_endpgm
592 ; GFX12-WGP-LABEL: private_nontemporal_store_0:
593 ; GFX12-WGP: ; %bb.0: ; %entry
594 ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
595 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
596 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
597 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
598 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
599 ; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT
600 ; GFX12-WGP-NEXT: s_endpgm
602 ; GFX12-CU-LABEL: private_nontemporal_store_0:
603 ; GFX12-CU: ; %bb.0: ; %entry
604 ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
605 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
606 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
607 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
608 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
609 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s2 th:TH_STORE_NT
610 ; GFX12-CU-NEXT: s_endpgm
611 ptr addrspace(1) %in, ptr addrspace(5) %out) {
; Here the !nontemporal hint is on the store only; the load from %in is a
; plain load.
613 %val = load i32, ptr addrspace(1) %in, align 4
614 store i32 %val, ptr addrspace(5) %out, !nontemporal !0
618 define amdgpu_kernel void @private_nontemporal_store_1(
619 ; GFX6-LABEL: private_nontemporal_store_1:
620 ; GFX6: ; %bb.0: ; %entry
621 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
622 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
623 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
624 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
625 ; GFX6-NEXT: s_add_u32 s8, s8, s7
626 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
627 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
628 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
629 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
630 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
631 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
632 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
633 ; GFX6-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
634 ; GFX6-NEXT: s_endpgm
636 ; GFX7-LABEL: private_nontemporal_store_1:
637 ; GFX7: ; %bb.0: ; %entry
638 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
639 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
640 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
641 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
642 ; GFX7-NEXT: s_add_u32 s8, s8, s7
643 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
644 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
645 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
646 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
647 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
648 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
650 ; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
651 ; GFX7-NEXT: s_endpgm
653 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
654 ; GFX10-WGP: ; %bb.0: ; %entry
655 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
656 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
657 ; GFX10-WGP-NEXT: s_clause 0x1
658 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
659 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
660 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
661 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
662 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
664 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
665 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
667 ; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
668 ; GFX10-WGP-NEXT: s_endpgm
670 ; GFX10-CU-LABEL: private_nontemporal_store_1:
671 ; GFX10-CU: ; %bb.0: ; %entry
672 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
673 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
674 ; GFX10-CU-NEXT: s_clause 0x1
675 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
676 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
677 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
678 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
679 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
681 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
682 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
683 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
684 ; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
685 ; GFX10-CU-NEXT: s_endpgm
687 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
688 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
689 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
690 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
691 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
692 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
693 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
694 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
695 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
696 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
697 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
698 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
699 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
700 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
701 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
702 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
703 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
704 ; SKIP-CACHE-INV-NEXT: s_endpgm
706 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
707 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
708 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
709 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
710 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
711 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
712 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
713 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
714 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
715 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
716 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
717 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
718 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
719 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
720 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
722 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
723 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
724 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
725 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
726 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
727 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
728 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
729 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
730 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
731 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
732 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
733 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
735 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
736 ; GFX90A-TGSPLIT-NEXT: s_endpgm
738 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
739 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
740 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
741 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
742 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
743 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
744 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
745 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
747 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off sc0 nt sc1
748 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
750 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
751 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
752 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
753 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
754 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
755 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
756 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
757 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
758 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
759 ; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off sc0 nt sc1
760 ; GFX940-TGSPLIT-NEXT: s_endpgm
762 ; GFX11-WGP-LABEL: private_nontemporal_store_1:
763 ; GFX11-WGP: ; %bb.0: ; %entry
764 ; GFX11-WGP-NEXT: s_clause 0x1
765 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
766 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
767 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
768 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
769 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
770 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
772 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
773 ; GFX11-WGP-NEXT: s_endpgm
775 ; GFX11-CU-LABEL: private_nontemporal_store_1:
776 ; GFX11-CU: ; %bb.0: ; %entry
777 ; GFX11-CU-NEXT: s_clause 0x1
778 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
779 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
780 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
781 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
782 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
783 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
784 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
785 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
786 ; GFX11-CU-NEXT: s_endpgm
788 ; GFX12-WGP-LABEL: private_nontemporal_store_1:
789 ; GFX12-WGP: ; %bb.0: ; %entry
790 ; GFX12-WGP-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
791 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
792 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
793 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
794 ; GFX12-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
795 ; GFX12-WGP-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT
796 ; GFX12-WGP-NEXT: s_endpgm
798 ; GFX12-CU-LABEL: private_nontemporal_store_1:
799 ; GFX12-CU: ; %bb.0: ; %entry
800 ; GFX12-CU-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
801 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
802 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
803 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
804 ; GFX12-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
805 ; GFX12-CU-NEXT: scratch_store_b32 v0, v1, s2 th:TH_STORE_NT
806 ; GFX12-CU-NEXT: s_endpgm
807 ptr addrspace(1) %in, ptr addrspace(5) %out) {
809 %tid = call i32 @llvm.amdgcn.workitem.id.x()
810 %val = load i32, ptr addrspace(1) %in, align 4
811 %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid
812 store i32 %val, ptr addrspace(5) %out.gep, !nontemporal !0
; Declaration of the intrinsic the kernel above calls to produce %tid, the i32
; index applied to the addrspace(5) %out pointer before the nontemporal store.
817 declare i32 @llvm.amdgcn.workitem.id.x()