1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
; private_nontemporal_load_0
; Kernel under test: loads an i32 through the addrspace(5) argument %in with
; !nontemporal metadata attached (align 4) and writes the value through the
; addrspace(1) argument %out using an ordinary store. No per-work-item offset
; is applied to %in.
;
; What the autogenerated checks below pin for the addrspace(5) load:
;   - buffer_load_dword with "glc slc" for the gfx600, gfx700, amdpal gfx700
;     and gfx90a runs;
;   - buffer_load_dword with only "slc" for both gfx1010 runs;
;   - scratch_load_dword with "nt" for both gfx940 runs;
;   - scratch_load_b32 with "slc dlc" for both gfx1100 runs.
; The store to %out carries no cache modifier in any of the check sequences.
;
; NOTE(review): in this function the gfx600 and gfx700 check sequences stop at
; the store and contain no s_endpgm line, unlike every other prefix here --
; confirm by regenerating with utils/update_llc_test_checks.py.
14 define amdgpu_kernel void @private_nontemporal_load_0(
15 ; GFX6-LABEL: private_nontemporal_load_0:
16 ; GFX6: ; %bb.0: ; %entry
17 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
18 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
19 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
20 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
21 ; GFX6-NEXT: s_add_u32 s8, s8, s7
22 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
23 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
24 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
26 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
27 ; GFX6-NEXT: s_mov_b32 s2, -1
28 ; GFX6-NEXT: s_waitcnt vmcnt(0)
29 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
32 ; GFX7-LABEL: private_nontemporal_load_0:
33 ; GFX7: ; %bb.0: ; %entry
34 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
35 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
36 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
37 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
38 ; GFX7-NEXT: s_add_u32 s8, s8, s7
39 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
40 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
42 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
43 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
44 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
45 ; GFX7-NEXT: s_waitcnt vmcnt(0)
46 ; GFX7-NEXT: flat_store_dword v[0:1], v2
49 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
50 ; GFX10-WGP: ; %bb.0: ; %entry
51 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
52 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
53 ; GFX10-WGP-NEXT: s_clause 0x1
54 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
55 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
56 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
57 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
58 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
59 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
61 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
62 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
64 ; GFX10-WGP-NEXT: s_endpgm
66 ; GFX10-CU-LABEL: private_nontemporal_load_0:
67 ; GFX10-CU: ; %bb.0: ; %entry
68 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
69 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
70 ; GFX10-CU-NEXT: s_clause 0x1
71 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
72 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
73 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
74 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
75 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
76 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
77 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
78 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
79 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
80 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
81 ; GFX10-CU-NEXT: s_endpgm
83 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
84 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
85 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
86 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
87 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
88 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
89 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
90 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
91 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
92 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
93 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
94 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
95 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
96 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
97 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
98 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
99 ; SKIP-CACHE-INV-NEXT: s_endpgm
101 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
102 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
103 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
104 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
105 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
106 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
107 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
108 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
109 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
110 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
112 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
113 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
114 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
115 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
117 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
118 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
119 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
120 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
121 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
122 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
123 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
124 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
125 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
126 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
128 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
129 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
130 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
131 ; GFX90A-TGSPLIT-NEXT: s_endpgm
133 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
134 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
135 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
136 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
137 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
138 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
140 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
141 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
142 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
144 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
145 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
146 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
147 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
148 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
149 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
151 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
152 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
153 ; GFX940-TGSPLIT-NEXT: s_endpgm
155 ; GFX11-WGP-LABEL: private_nontemporal_load_0:
156 ; GFX11-WGP: ; %bb.0: ; %entry
157 ; GFX11-WGP-NEXT: s_clause 0x1
158 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
159 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
160 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
161 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
163 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
164 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
165 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
166 ; GFX11-WGP-NEXT: s_endpgm
168 ; GFX11-CU-LABEL: private_nontemporal_load_0:
169 ; GFX11-CU: ; %bb.0: ; %entry
170 ; GFX11-CU-NEXT: s_clause 0x1
171 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
172 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
173 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
174 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
175 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
176 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
177 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
178 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
179 ; GFX11-CU-NEXT: s_endpgm
180 i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
182 %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
183 store i32 %val, i32 addrspace(1)* %out
; private_nontemporal_load_1
; Kernel under test: same shape as the un-indexed load test, but the
; addrspace(5) pointer %in is first advanced by the work-item id
; (llvm.amdgcn.workitem.id.x) through an inbounds getelementptr over i32.
; The indexed element is loaded with !nontemporal metadata (align 4) and then
; written through the addrspace(1) argument %out with an ordinary store.
;
; What the autogenerated checks below pin:
;   - the index is scaled by a left shift of 2 (v_lshlrev_b32 followed by
;     v_add_i32 on the gfx600/gfx700 runs, a single v_lshl_add_u32 on the
;     gfx1010 and gfx90a runs, and a v_lshlrev_b32 feeding the scratch
;     instruction's VGPR operand on the gfx940 and gfx1100 runs);
;   - the load modifiers: "glc slc" (gfx600, gfx700, amdpal gfx700, gfx90a),
;     "slc" only (gfx1010), "nt" (gfx940), "slc dlc" (gfx1100).
187 define amdgpu_kernel void @private_nontemporal_load_1(
188 ; GFX6-LABEL: private_nontemporal_load_1:
189 ; GFX6: ; %bb.0: ; %entry
190 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
191 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
192 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
193 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
194 ; GFX6-NEXT: s_add_u32 s8, s8, s7
195 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
196 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
197 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
199 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
200 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
201 ; GFX6-NEXT: s_mov_b32 s2, -1
202 ; GFX6-NEXT: s_waitcnt vmcnt(0)
203 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
204 ; GFX6-NEXT: s_endpgm
206 ; GFX7-LABEL: private_nontemporal_load_1:
207 ; GFX7: ; %bb.0: ; %entry
208 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
209 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
210 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
211 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
212 ; GFX7-NEXT: s_add_u32 s8, s8, s7
213 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
214 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
215 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
217 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
218 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
219 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
220 ; GFX7-NEXT: s_waitcnt vmcnt(0)
221 ; GFX7-NEXT: flat_store_dword v[0:1], v2
222 ; GFX7-NEXT: s_endpgm
224 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
225 ; GFX10-WGP: ; %bb.0: ; %entry
226 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
227 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
228 ; GFX10-WGP-NEXT: s_clause 0x1
229 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
230 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
231 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
232 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
233 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
234 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
235 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
236 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
237 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
238 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
239 ; GFX10-WGP-NEXT: s_endpgm
241 ; GFX10-CU-LABEL: private_nontemporal_load_1:
242 ; GFX10-CU: ; %bb.0: ; %entry
243 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
244 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
245 ; GFX10-CU-NEXT: s_clause 0x1
246 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
247 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
248 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
249 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
250 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
251 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
252 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
253 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
254 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
255 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
256 ; GFX10-CU-NEXT: s_endpgm
258 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
259 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
260 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
261 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
262 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
263 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
264 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
265 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
266 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
267 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
268 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
269 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
270 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
271 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
272 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
273 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
274 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
275 ; SKIP-CACHE-INV-NEXT: s_endpgm
277 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
278 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
279 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
280 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
281 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
282 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
283 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
284 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
285 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
286 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
288 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
289 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
290 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
291 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
293 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
294 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
295 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
296 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
297 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
298 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
299 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
300 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
301 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
302 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
303 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
304 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
305 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
306 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
307 ; GFX90A-TGSPLIT-NEXT: s_endpgm
309 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
310 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
311 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
312 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
313 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
314 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
315 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
316 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt
317 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
318 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
319 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
321 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
322 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
323 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
324 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
325 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
326 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
327 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt
329 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
330 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
331 ; GFX940-TGSPLIT-NEXT: s_endpgm
333 ; GFX11-WGP-LABEL: private_nontemporal_load_1:
334 ; GFX11-WGP: ; %bb.0: ; %entry
335 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
336 ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
337 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
338 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
340 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
341 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
342 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
343 ; GFX11-WGP-NEXT: s_endpgm
345 ; GFX11-CU-LABEL: private_nontemporal_load_1:
346 ; GFX11-CU: ; %bb.0: ; %entry
347 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
348 ; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
349 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
350 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
352 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
353 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
354 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
355 ; GFX11-CU-NEXT: s_endpgm
356 i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
358 %tid = call i32 @llvm.amdgcn.workitem.id.x()
359 %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
360 %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
361 store i32 %val, i32 addrspace(1)* %out
; private_nontemporal_store_0
; Kernel under test: loads an i32 through the addrspace(1) argument %in with
; an ordinary load (align 4) and writes it through the addrspace(5) argument
; %out with !nontemporal metadata on the store. No per-work-item offset is
; applied to %out.
;
; What the autogenerated checks below pin for the addrspace(5) store:
;   - buffer_store_dword with "glc slc" for the gfx600, gfx700, gfx1010,
;     amdpal gfx700 and gfx90a runs;
;   - scratch_store_dword with "nt" for both gfx940 runs;
;   - scratch_store_b32 with "glc slc dlc" for both gfx1100 runs.
; In every check sequence the value from %in is fetched with a scalar load
; (s_load_dword / s_load_b32) that carries no cache modifier.
365 define amdgpu_kernel void @private_nontemporal_store_0(
366 ; GFX6-LABEL: private_nontemporal_store_0:
367 ; GFX6: ; %bb.0: ; %entry
368 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
369 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
370 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
371 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
372 ; GFX6-NEXT: s_add_u32 s8, s8, s7
373 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
374 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
375 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
376 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
377 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
378 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
379 ; GFX6-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
380 ; GFX6-NEXT: s_endpgm
382 ; GFX7-LABEL: private_nontemporal_store_0:
383 ; GFX7: ; %bb.0: ; %entry
384 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
385 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
386 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
387 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
388 ; GFX7-NEXT: s_add_u32 s8, s8, s7
389 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
390 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
391 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
392 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
393 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
394 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
395 ; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
396 ; GFX7-NEXT: s_endpgm
398 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
399 ; GFX10-WGP: ; %bb.0: ; %entry
400 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
401 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
402 ; GFX10-WGP-NEXT: s_clause 0x1
403 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
404 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
405 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
406 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
407 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
409 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
410 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
411 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
412 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
413 ; GFX10-WGP-NEXT: s_endpgm
415 ; GFX10-CU-LABEL: private_nontemporal_store_0:
416 ; GFX10-CU: ; %bb.0: ; %entry
417 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
418 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
419 ; GFX10-CU-NEXT: s_clause 0x1
420 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
421 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
422 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
423 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
424 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
426 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
427 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
429 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
430 ; GFX10-CU-NEXT: s_endpgm
432 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
433 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
434 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
435 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
436 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
437 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
438 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
439 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
440 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
441 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
442 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
443 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
444 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
445 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
446 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
447 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
448 ; SKIP-CACHE-INV-NEXT: s_endpgm
450 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
451 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
452 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
453 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
454 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
455 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
456 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
457 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
458 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
460 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
461 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
462 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
463 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
464 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
466 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
467 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
468 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
469 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
470 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
471 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
472 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
473 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
474 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
476 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
477 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
479 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
480 ; GFX90A-TGSPLIT-NEXT: s_endpgm
482 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
483 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
484 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
485 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
486 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
488 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
490 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
491 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
493 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
494 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
495 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
496 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
497 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
498 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
499 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
500 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
501 ; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
502 ; GFX940-TGSPLIT-NEXT: s_endpgm
504 ; GFX11-WGP-LABEL: private_nontemporal_store_0:
505 ; GFX11-WGP: ; %bb.0: ; %entry
506 ; GFX11-WGP-NEXT: s_clause 0x1
507 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
508 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
509 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
511 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
512 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
513 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
514 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
515 ; GFX11-WGP-NEXT: s_endpgm
517 ; GFX11-CU-LABEL: private_nontemporal_store_0:
518 ; GFX11-CU: ; %bb.0: ; %entry
519 ; GFX11-CU-NEXT: s_clause 0x1
520 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
521 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
522 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
523 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
524 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
525 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
526 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
527 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
528 ; GFX11-CU-NEXT: s_endpgm
529 i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
531 %val = load i32, i32 addrspace(1)* %in, align 4
532 store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
; private_nontemporal_store_1
; Kernel under test: same shape as the un-indexed store test, but the
; addrspace(5) pointer %out is first advanced by the work-item id
; (llvm.amdgcn.workitem.id.x) through an inbounds getelementptr over i32.
; The value is read from the addrspace(1) argument %in with an ordinary load
; (align 4) and written to the indexed element with !nontemporal metadata.
;
; What the autogenerated checks below pin:
;   - the index is scaled by a left shift of 2 (v_lshlrev_b32 followed by
;     v_add_i32 on the gfx600/gfx700 runs, a single v_lshl_add_u32 on the
;     gfx1010 and gfx90a runs, and a v_lshlrev_b32 feeding the scratch
;     instruction's VGPR operand on the gfx940 and gfx1100 runs);
;   - the store modifiers: "glc slc" (gfx600, gfx700, gfx1010, amdpal gfx700,
;     gfx90a), "nt" (gfx940), "glc slc dlc" (gfx1100).
536 define amdgpu_kernel void @private_nontemporal_store_1(
537 ; GFX6-LABEL: private_nontemporal_store_1:
538 ; GFX6: ; %bb.0: ; %entry
539 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
540 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
541 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
542 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
543 ; GFX6-NEXT: s_add_u32 s8, s8, s7
544 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
545 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
546 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
547 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
548 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
549 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
551 ; GFX6-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
552 ; GFX6-NEXT: s_endpgm
554 ; GFX7-LABEL: private_nontemporal_store_1:
555 ; GFX7: ; %bb.0: ; %entry
556 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
557 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
558 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
559 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
560 ; GFX7-NEXT: s_add_u32 s8, s8, s7
561 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
562 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
563 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
564 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
565 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
566 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
568 ; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
569 ; GFX7-NEXT: s_endpgm
571 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
572 ; GFX10-WGP: ; %bb.0: ; %entry
573 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
574 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
575 ; GFX10-WGP-NEXT: s_clause 0x1
576 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
577 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
578 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
579 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
580 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
581 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
582 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
583 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
585 ; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
586 ; GFX10-WGP-NEXT: s_endpgm
588 ; GFX10-CU-LABEL: private_nontemporal_store_1:
589 ; GFX10-CU: ; %bb.0: ; %entry
590 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
591 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
592 ; GFX10-CU-NEXT: s_clause 0x1
593 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
594 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
595 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
596 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
597 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
598 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
599 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
600 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
601 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
602 ; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
603 ; GFX10-CU-NEXT: s_endpgm
605 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
606 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
607 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
608 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
609 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
610 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
611 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
612 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
613 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
614 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
615 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
616 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
617 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
618 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
619 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
620 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
621 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
622 ; SKIP-CACHE-INV-NEXT: s_endpgm
624 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
625 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
626 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
627 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
628 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
629 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
630 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
631 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
632 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
634 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
635 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
637 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
638 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
640 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
641 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
642 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
643 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
644 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
645 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
646 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
647 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
648 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
650 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
651 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
652 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
653 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
654 ; GFX90A-TGSPLIT-NEXT: s_endpgm
656 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
657 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
658 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
659 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
660 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
661 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
662 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
663 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
664 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
665 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
666 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
668 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
669 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
670 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
671 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
672 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
673 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
674 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
675 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
676 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
677 ; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
678 ; GFX940-TGSPLIT-NEXT: s_endpgm
680 ; GFX11-WGP-LABEL: private_nontemporal_store_1:
681 ; GFX11-WGP: ; %bb.0: ; %entry
682 ; GFX11-WGP-NEXT: s_clause 0x1
683 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
684 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
685 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
687 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
689 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
690 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
691 ; GFX11-WGP-NEXT: s_endpgm
693 ; GFX11-CU-LABEL: private_nontemporal_store_1:
694 ; GFX11-CU: ; %bb.0: ; %entry
695 ; GFX11-CU-NEXT: s_clause 0x1
696 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
697 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
698 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
699 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
700 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
701 ; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
702 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
703 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
704 ; GFX11-CU-NEXT: s_endpgm
705 i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
707 %tid = call i32 @llvm.amdgcn.workitem.id.x()
708 %val = load i32, i32 addrspace(1)* %in, align 4
709 %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
710 store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
715 declare i32 @llvm.amdgcn.workitem.id.x()