; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Kernel under test: a nontemporal i32 load through a private (addrspace 5)
; pointer argument; the loaded value is then written with an ordinary store
; through a global (addrspace 1) pointer argument.
; The assertions pin, per target, the cache-policy modifiers printed on the
; private load (glc/slc on the buffer loads, nt on the gfx940 scratch load,
; slc dlc on gfx11, th:TH_LOAD_NT on gfx12).
; The !nontemporal operand !0 is defined outside this excerpt.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_add_u32 s0, s0, s15
; GFX6-NEXT: s_addc_u32 s1, s1, 0
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
    ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
  ; The access under test: only this load carries !nontemporal.
  %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}
; Kernel under test: like private_nontemporal_load_0, but the private
; (addrspace 5) address is the pointer argument indexed by the workitem id x
; (an i32 getelementptr), so the checked code also contains the address
; arithmetic (shift left by 2 plus base) ahead of the nontemporal load.
; The loaded value is written with an ordinary store to the global
; (addrspace 1) pointer argument.
; The !nontemporal operand !0 and the declaration of
; @llvm.amdgcn.workitem.id.x are outside this excerpt.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_load_1(
; GFX6-LABEL: private_nontemporal_load_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_add_u32 s0, s0, s15
; GFX6-NEXT: s_addc_u32 s1, s1, 0
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: s_mov_b32 s9, 2
; GFX6-NEXT: v_lshlrev_b32_e64 v0, s9, v0
; GFX6-NEXT: v_add_i32_e64 v0, s[8:9], s8, v0
; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s5, v0
; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 2
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
; GFX11-WGP-NEXT: s_mov_b32 s2, 2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
; GFX11-CU-NEXT: s_mov_b32 s2, 2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
    ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
  ; Per-workitem element of the private array addressed by %in.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
  ; The access under test: only this load carries !nontemporal.
  %val = load i32, ptr addrspace(5) %val.gep, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}
; Kernel under test: an ordinary i32 load through a global (addrspace 1)
; pointer argument followed by a nontemporal store of that value through a
; private (addrspace 5) pointer argument.
; The assertions pin, per target, the cache-policy modifiers printed on the
; private store (glc slc on the buffer stores, sc0 nt sc1 on gfx940,
; glc slc dlc on gfx11, th:TH_STORE_NT on gfx12).
; The !nontemporal operand !0 is defined outside this excerpt.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_store_0(
; GFX6-LABEL: private_nontemporal_store_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_add_u32 s0, s0, s15
; GFX6-NEXT: s_addc_u32 s1, s1, 0
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 sc0 nt sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 sc0 nt sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_0:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
    ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
  %val = load i32, ptr addrspace(1) %in, align 4
  ; The access under test: only this store carries !nontemporal.
  store i32 %val, ptr addrspace(5) %out, !nontemporal !0
  ret void
}
631 define amdgpu_kernel void @private_nontemporal_store_1(
632 ; GFX6-LABEL: private_nontemporal_store_1:
633 ; GFX6: ; %bb.0: ; %entry
634 ; GFX6-NEXT: s_add_u32 s0, s0, s15
635 ; GFX6-NEXT: s_addc_u32 s1, s1, 0
636 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
637 ; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2
638 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
640 ; GFX6-NEXT: s_mov_b32 s6, 2
641 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s6, v0
642 ; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], s5, v0
643 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
644 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
645 ; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
646 ; GFX6-NEXT: s_endpgm
648 ; GFX7-LABEL: private_nontemporal_store_1:
649 ; GFX7: ; %bb.0: ; %entry
650 ; GFX7-NEXT: s_add_u32 s0, s0, s15
651 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
652 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
653 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
654 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
656 ; GFX7-NEXT: s_mov_b32 s6, 2
657 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s6, v0
658 ; GFX7-NEXT: v_add_i32_e64 v1, s[6:7], s5, v0
659 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
661 ; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
662 ; GFX7-NEXT: s_endpgm
664 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
665 ; GFX10-WGP: ; %bb.0: ; %entry
666 ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
667 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
668 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
669 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
670 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
671 ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
672 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2
673 ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6
674 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
675 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
676 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
677 ; GFX10-WGP-NEXT: s_endpgm
679 ; GFX10-CU-LABEL: private_nontemporal_store_1:
680 ; GFX10-CU: ; %bb.0: ; %entry
681 ; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
682 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
683 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
684 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
685 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
687 ; GFX10-CU-NEXT: s_mov_b32 s5, 2
688 ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6
689 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
691 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
692 ; GFX10-CU-NEXT: s_endpgm
694 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
695 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
696 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
697 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
698 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
699 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
700 ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
701 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
702 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
703 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
704 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
705 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
706 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
707 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s2, v0
708 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v1, s[2:3], s1, v0
709 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
710 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
711 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
712 ; SKIP-CACHE-INV-NEXT: s_endpgm
714 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
715 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
716 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
717 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
718 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
719 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
720 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
721 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
722 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
723 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
724 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2
725 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
726 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1
727 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
728 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
729 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
730 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
732 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
733 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
734 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
735 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
736 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
737 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
738 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
739 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
740 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
741 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
742 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2
743 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
744 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1
745 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
747 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
748 ; GFX90A-TGSPLIT-NEXT: s_endpgm
750 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
751 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
752 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
753 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
754 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
755 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
756 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
757 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
758 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2
759 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
760 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1
761 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
762 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
763 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off sc0 nt sc1
764 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
766 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
767 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
768 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
769 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
770 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
772 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
773 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
774 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 2
775 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
776 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1
777 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
778 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
779 ; GFX940-TGSPLIT-NEXT: scratch_store_dword v1, v0, off sc0 nt sc1
780 ; GFX940-TGSPLIT-NEXT: s_endpgm
782 ; GFX11-WGP-LABEL: private_nontemporal_store_1:
783 ; GFX11-WGP: ; %bb.0: ; %entry
784 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
785 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
786 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
787 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
788 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
789 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
790 ; GFX11-WGP-NEXT: s_mov_b32 s1, 2
791 ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2
792 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
793 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
794 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc
795 ; GFX11-WGP-NEXT: s_endpgm
797 ; GFX11-CU-LABEL: private_nontemporal_store_1:
798 ; GFX11-CU: ; %bb.0: ; %entry
799 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
800 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
801 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
802 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
803 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
804 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
805 ; GFX11-CU-NEXT: s_mov_b32 s1, 2
806 ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2
807 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
808 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
809 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc
810 ; GFX11-CU-NEXT: s_endpgm
812 ; GFX12-WGP-LABEL: private_nontemporal_store_1:
813 ; GFX12-WGP: ; %bb.0: ; %entry
814 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
815 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
816 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
817 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
818 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
819 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
820 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2
821 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe
822 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
823 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
824 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
825 ; GFX12-WGP-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
826 ; GFX12-WGP-NEXT: s_endpgm
828 ; GFX12-CU-LABEL: private_nontemporal_store_1:
829 ; GFX12-CU: ; %bb.0: ; %entry
830 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
831 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
832 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
833 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
834 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
835 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
836 ; GFX12-CU-NEXT: s_mov_b32 s2, 2
837 ; GFX12-CU-NEXT: s_wait_alu 0xfffe
838 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
839 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
840 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
841 ; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
842 ; GFX12-CU-NEXT: s_endpgm
843 ptr addrspace(1) %in, ptr addrspace(5) %out) {
845 %tid = call i32 @llvm.amdgcn.workitem.id.x()
846 %val = load i32, ptr addrspace(1) %in, align 4
847 %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid
848 store i32 %val, ptr addrspace(5) %out.gep, !nontemporal !0
; Test kernel @private_nontemporal_volatile_load
;
; Loads an i32 through %in (an addrspace(5) pointer) with the load marked both
; `volatile` and `!nontemporal`, then stores the value through %out (an
; addrspace(1) pointer, selected as a global/flat/buffer store below).
;
; The assertions that follow are autogenerated (see the NOTE on the first line
; of this file) and pin the cache-policy bits each llc invocation emits for the
; load:
;   * gfx6, gfx7, the amdpal run and gfx90a  ->  buffer_load_dword ... glc
;   * gfx10 (wgp and cu mode)                ->  buffer_load_dword ... glc dlc
;   * gfx940                                 ->  scratch_load_dword ... sc0 sc1
;   * gfx11 (wgp and cu mode)                ->  scratch_load_b32 ... glc dlc
;   * gfx12 (wgp and cu mode)                ->  scratch_load_b32 ... th:TH_LOAD_NT scope:SCOPE_SYS
; In every variant the load is followed by s_waitcnt vmcnt(0) before the store;
; on gfx12 that wait is s_wait_bvhcnt / s_wait_samplecnt / s_wait_loadcnt 0x0.
; Regenerate the assertions with utils/update_llc_test_checks.py instead of
; editing them by hand.
852 define amdgpu_kernel void @private_nontemporal_volatile_load(
853 ; GFX6-LABEL: private_nontemporal_volatile_load:
854 ; GFX6: ; %bb.0: ; %entry
855 ; GFX6-NEXT: s_add_u32 s0, s0, s15
856 ; GFX6-NEXT: s_addc_u32 s1, s1, 0
857 ; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
858 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
859 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
860 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
861 ; GFX6-NEXT: s_mov_b32 s11, s5
862 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
863 ; GFX6-NEXT: s_mov_b32 s9, 0x100f000
864 ; GFX6-NEXT: s_mov_b32 s10, -1
865 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
866 ; GFX6-NEXT: s_mov_b32 s5, s11
867 ; GFX6-NEXT: s_mov_b32 s6, s10
868 ; GFX6-NEXT: s_mov_b32 s7, s9
869 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
870 ; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
871 ; GFX6-NEXT: s_waitcnt vmcnt(0)
872 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
873 ; GFX6-NEXT: s_endpgm
875 ; GFX7-LABEL: private_nontemporal_volatile_load:
876 ; GFX7: ; %bb.0: ; %entry
877 ; GFX7-NEXT: s_add_u32 s0, s0, s15
878 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
879 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
880 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
881 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
882 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
883 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc
884 ; GFX7-NEXT: s_waitcnt vmcnt(0)
885 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
886 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
887 ; GFX7-NEXT: flat_store_dword v[0:1], v2
888 ; GFX7-NEXT: s_endpgm
890 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
891 ; GFX10-WGP: ; %bb.0: ; %entry
892 ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
893 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
894 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
895 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
896 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
897 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
898 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
899 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
900 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
901 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
902 ; GFX10-WGP-NEXT: s_endpgm
904 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
905 ; GFX10-CU: ; %bb.0: ; %entry
906 ; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
907 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
908 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
909 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
910 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
911 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
912 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
913 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
914 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
915 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
916 ; GFX10-CU-NEXT: s_endpgm
918 ; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load:
919 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
920 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
921 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
922 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
923 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
924 ; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
925 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
926 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
927 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
928 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
929 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
930 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
931 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
932 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
933 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
934 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
935 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
936 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
937 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
938 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
939 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc
940 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
941 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
942 ; SKIP-CACHE-INV-NEXT: s_endpgm
944 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
945 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
946 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
947 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
948 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
949 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
950 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
951 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
952 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
953 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
954 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
955 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
956 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
958 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
959 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
960 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
961 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
962 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
963 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
964 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
965 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
966 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
967 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
968 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
969 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
970 ; GFX90A-TGSPLIT-NEXT: s_endpgm
972 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
973 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
974 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
975 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
976 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
977 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
978 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1
979 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
980 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
981 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
983 ; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load:
984 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
985 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
986 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
987 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
988 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
989 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1
990 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
991 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
992 ; GFX940-TGSPLIT-NEXT: s_endpgm
994 ; GFX11-WGP-LABEL: private_nontemporal_volatile_load:
995 ; GFX11-WGP: ; %bb.0: ; %entry
996 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
997 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
998 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
999 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1000 ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc
1001 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
1002 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1003 ; GFX11-WGP-NEXT: s_endpgm
1005 ; GFX11-CU-LABEL: private_nontemporal_volatile_load:
1006 ; GFX11-CU: ; %bb.0: ; %entry
1007 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
1008 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1009 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
1010 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1011 ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc
1012 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
1013 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1014 ; GFX11-CU-NEXT: s_endpgm
1016 ; GFX12-WGP-LABEL: private_nontemporal_volatile_load:
1017 ; GFX12-WGP: ; %bb.0: ; %entry
1018 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
1019 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1020 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
1021 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1022 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
1023 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1024 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1025 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
1026 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1027 ; GFX12-WGP-NEXT: s_endpgm
1029 ; GFX12-CU-LABEL: private_nontemporal_volatile_load:
1030 ; GFX12-CU: ; %bb.0: ; %entry
1031 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
1032 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1033 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
1034 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1035 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
1036 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
1037 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
1038 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
1039 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1040 ; GFX12-CU-NEXT: s_endpgm
1041 ptr addrspace(5) %in, ptr addrspace(1) %out) {
; Access under test - `volatile` and `!nontemporal` are both set on this load.
1043 %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0
; Plain store of the loaded value (neither volatile nor nontemporal).
1044 store i32 %val, ptr addrspace(1) %out
; Declaration of the i32-returning @llvm.amdgcn.workitem.id.x intrinsic; in this
; file @private_nontemporal_store_1 calls it to obtain %tid, the index used to
; form the store address.
1049 declare i32 @llvm.amdgcn.workitem.id.x()