1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
; Nontemporal i32 load through the addrspace(5) kernel argument %in (the
; "private" pointer in the test name), followed by an ordinary store of the
; loaded value through the addrspace(1) kernel argument %out.
; Per the CHECK lines below, the load is expected to carry these cache bits:
;   GFX6, GFX7, SKIP-CACHE-INV, GFX90A-*: glc slc
;   GFX10-*:                              slc
;   GFX940-*:                             nt
;   GFX11-*:                              slc dlc
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): metadata node !0 is defined outside the visible part of the file.
14 define amdgpu_kernel void @private_nontemporal_load_0(
15 ; GFX6-LABEL: private_nontemporal_load_0:
16 ; GFX6: ; %bb.0: ; %entry
17 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
18 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
19 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
20 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
21 ; GFX6-NEXT: s_add_u32 s8, s8, s7
22 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
23 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
24 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
26 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
27 ; GFX6-NEXT: s_mov_b32 s2, -1
28 ; GFX6-NEXT: s_waitcnt vmcnt(0)
29 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
32 ; GFX7-LABEL: private_nontemporal_load_0:
33 ; GFX7: ; %bb.0: ; %entry
34 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
35 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
36 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
37 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
38 ; GFX7-NEXT: s_add_u32 s8, s8, s7
39 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
40 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
42 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
43 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
44 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
45 ; GFX7-NEXT: s_waitcnt vmcnt(0)
46 ; GFX7-NEXT: flat_store_dword v[0:1], v2
49 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
50 ; GFX10-WGP: ; %bb.0: ; %entry
51 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
52 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
53 ; GFX10-WGP-NEXT: s_clause 0x1
54 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
55 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
56 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
57 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
58 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
59 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
61 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
62 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
64 ; GFX10-WGP-NEXT: s_endpgm
66 ; GFX10-CU-LABEL: private_nontemporal_load_0:
67 ; GFX10-CU: ; %bb.0: ; %entry
68 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
69 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
70 ; GFX10-CU-NEXT: s_clause 0x1
71 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
72 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
73 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
74 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
75 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
76 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
77 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
78 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
79 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
80 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
81 ; GFX10-CU-NEXT: s_endpgm
83 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
84 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
85 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
86 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
87 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
88 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
89 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
90 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
91 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
92 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
93 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
94 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
95 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
96 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
97 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
98 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
99 ; SKIP-CACHE-INV-NEXT: s_endpgm
101 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
102 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
103 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
104 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
105 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
106 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
107 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
108 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
109 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
110 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
112 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
113 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
114 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
115 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
117 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
118 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
119 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
120 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
121 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
122 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
123 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
124 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
125 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
126 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
128 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
129 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
130 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
131 ; GFX90A-TGSPLIT-NEXT: s_endpgm
133 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
134 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
135 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
136 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
137 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
138 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
140 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
141 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
142 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
144 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
145 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
146 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
147 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
148 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
149 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
151 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
152 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
153 ; GFX940-TGSPLIT-NEXT: s_endpgm
155 ; GFX11-WGP-LABEL: private_nontemporal_load_0:
156 ; GFX11-WGP: ; %bb.0: ; %entry
157 ; GFX11-WGP-NEXT: s_clause 0x1
158 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
159 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
160 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
161 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
163 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
164 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
165 ; GFX11-WGP-NEXT: s_nop 0
166 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
167 ; GFX11-WGP-NEXT: s_endpgm
169 ; GFX11-CU-LABEL: private_nontemporal_load_0:
170 ; GFX11-CU: ; %bb.0: ; %entry
171 ; GFX11-CU-NEXT: s_clause 0x1
172 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
173 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
174 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
175 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
176 ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
177 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
178 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
179 ; GFX11-CU-NEXT: s_nop 0
180 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
181 ; GFX11-CU-NEXT: s_endpgm
182 ptr addrspace(5) %in, ptr addrspace(1) %out) {
184 %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
185 store i32 %val, ptr addrspace(1) %out
; Same as the nontemporal private load test with a direct pointer, except that
; the address is %in indexed (getelementptr inbounds i32) by the value returned
; from @llvm.amdgcn.workitem.id.x, so the addrspace(5) address is not uniform.
; The loaded i32 is stored with an ordinary store through addrspace(1) %out.
; Per the CHECK lines below, the load is expected to carry these cache bits:
;   GFX6, GFX7, SKIP-CACHE-INV, GFX90A-*: glc slc
;   GFX10-*:                              slc
;   GFX940-*:                             nt
;   GFX11-*:                              slc dlc
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): metadata node !0 is defined outside the visible part of the file.
189 define amdgpu_kernel void @private_nontemporal_load_1(
190 ; GFX6-LABEL: private_nontemporal_load_1:
191 ; GFX6: ; %bb.0: ; %entry
192 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
193 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
194 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0
195 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
196 ; GFX6-NEXT: s_add_u32 s8, s8, s7
197 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
198 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
199 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
201 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
202 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
203 ; GFX6-NEXT: s_mov_b32 s2, -1
204 ; GFX6-NEXT: s_waitcnt vmcnt(0)
205 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
206 ; GFX6-NEXT: s_endpgm
208 ; GFX7-LABEL: private_nontemporal_load_1:
209 ; GFX7: ; %bb.0: ; %entry
210 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
211 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
212 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
213 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
214 ; GFX7-NEXT: s_add_u32 s8, s8, s7
215 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
216 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
217 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
218 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
219 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
220 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
221 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
222 ; GFX7-NEXT: s_waitcnt vmcnt(0)
223 ; GFX7-NEXT: flat_store_dword v[0:1], v2
224 ; GFX7-NEXT: s_endpgm
226 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
227 ; GFX10-WGP: ; %bb.0: ; %entry
228 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
229 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
230 ; GFX10-WGP-NEXT: s_clause 0x1
231 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
232 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
233 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
234 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
235 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
236 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
238 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
239 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
240 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
241 ; GFX10-WGP-NEXT: s_endpgm
243 ; GFX10-CU-LABEL: private_nontemporal_load_1:
244 ; GFX10-CU: ; %bb.0: ; %entry
245 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
246 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
247 ; GFX10-CU-NEXT: s_clause 0x1
248 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
249 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
250 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
251 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
252 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
253 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
254 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
255 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
256 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
257 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
258 ; GFX10-CU-NEXT: s_endpgm
260 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
261 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
262 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
263 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
264 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
265 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
266 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
267 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
268 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
269 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
270 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
271 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
272 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
273 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
274 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
275 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
276 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
277 ; SKIP-CACHE-INV-NEXT: s_endpgm
279 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
280 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
281 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
282 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
283 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
284 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
285 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
286 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
287 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
288 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
290 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
291 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
292 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
293 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
295 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
296 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
297 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
298 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
299 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
300 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
301 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
302 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
303 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
304 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
306 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
307 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
308 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
309 ; GFX90A-TGSPLIT-NEXT: s_endpgm
311 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
312 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
313 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
314 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
315 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
316 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
317 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
318 ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
319 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
320 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
321 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
323 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
324 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
325 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
326 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
327 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
328 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
330 ; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
331 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
332 ; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
333 ; GFX940-TGSPLIT-NEXT: s_endpgm
335 ; GFX11-WGP-LABEL: private_nontemporal_load_1:
336 ; GFX11-WGP: ; %bb.0: ; %entry
337 ; GFX11-WGP-NEXT: s_clause 0x1
338 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
339 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
340 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
341 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
343 ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc
344 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
345 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
346 ; GFX11-WGP-NEXT: s_nop 0
347 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
348 ; GFX11-WGP-NEXT: s_endpgm
350 ; GFX11-CU-LABEL: private_nontemporal_load_1:
351 ; GFX11-CU: ; %bb.0: ; %entry
352 ; GFX11-CU-NEXT: s_clause 0x1
353 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
354 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
355 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
356 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
357 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
358 ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc
359 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
360 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
361 ; GFX11-CU-NEXT: s_nop 0
362 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
363 ; GFX11-CU-NEXT: s_endpgm
364 ptr addrspace(5) %in, ptr addrspace(1) %out) {
366 %tid = call i32 @llvm.amdgcn.workitem.id.x()
367 %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
368 %val = load i32, ptr addrspace(5) %val.gep, align 4, !nontemporal !0
369 store i32 %val, ptr addrspace(1) %out
; Nontemporal i32 store through the addrspace(5) kernel argument %out (the
; "private" pointer in the test name). The stored value comes from an ordinary
; (not nontemporal) load through the addrspace(1) kernel argument %in.
; Per the CHECK lines below, the store is expected to carry these cache bits:
;   GFX6, GFX7, GFX10-*, SKIP-CACHE-INV, GFX90A-*: glc slc
;   GFX940-*:                                      sc0 nt sc1
;   GFX11-*:                                       glc slc dlc
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): metadata node !0 is defined outside the visible part of the file.
373 define amdgpu_kernel void @private_nontemporal_store_0(
374 ; GFX6-LABEL: private_nontemporal_store_0:
375 ; GFX6: ; %bb.0: ; %entry
376 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
377 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
378 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
379 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
380 ; GFX6-NEXT: s_add_u32 s8, s8, s7
381 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
382 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
383 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
384 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
385 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
387 ; GFX6-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
388 ; GFX6-NEXT: s_endpgm
390 ; GFX7-LABEL: private_nontemporal_store_0:
391 ; GFX7: ; %bb.0: ; %entry
392 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
393 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
394 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
395 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
396 ; GFX7-NEXT: s_add_u32 s8, s8, s7
397 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
398 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
400 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
401 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
403 ; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
404 ; GFX7-NEXT: s_endpgm
406 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
407 ; GFX10-WGP: ; %bb.0: ; %entry
408 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
409 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
410 ; GFX10-WGP-NEXT: s_clause 0x1
411 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
412 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
413 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
414 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
415 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
416 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
417 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
418 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
419 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
420 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
421 ; GFX10-WGP-NEXT: s_endpgm
423 ; GFX10-CU-LABEL: private_nontemporal_store_0:
424 ; GFX10-CU: ; %bb.0: ; %entry
425 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
426 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
427 ; GFX10-CU-NEXT: s_clause 0x1
428 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
429 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
430 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
431 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
432 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
434 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
435 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
436 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
437 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
438 ; GFX10-CU-NEXT: s_endpgm
440 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
441 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
442 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
443 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
444 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
445 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
446 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
447 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
448 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
449 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
450 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
451 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
452 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
453 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
454 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
455 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
456 ; SKIP-CACHE-INV-NEXT: s_endpgm
458 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
459 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
460 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
461 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
462 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
463 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
464 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
465 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
466 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
468 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
469 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
471 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
472 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
474 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
475 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
476 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
477 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
478 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
479 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
480 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
481 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
482 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
484 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
485 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
486 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
487 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
488 ; GFX90A-TGSPLIT-NEXT: s_endpgm
490 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
491 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
492 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
493 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
494 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
496 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
497 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
498 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 sc0 nt sc1
499 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
501 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
502 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
503 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
504 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
505 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
506 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
507 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
509 ; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 sc0 nt sc1
510 ; GFX940-TGSPLIT-NEXT: s_endpgm
512 ; GFX11-WGP-LABEL: private_nontemporal_store_0:
513 ; GFX11-WGP: ; %bb.0: ; %entry
514 ; GFX11-WGP-NEXT: s_clause 0x1
515 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
516 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
517 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
519 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
520 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
521 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
522 ; GFX11-WGP-NEXT: s_endpgm
524 ; GFX11-CU-LABEL: private_nontemporal_store_0:
525 ; GFX11-CU: ; %bb.0: ; %entry
526 ; GFX11-CU-NEXT: s_clause 0x1
527 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
528 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
529 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
531 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
532 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
533 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
534 ; GFX11-CU-NEXT: s_endpgm
535 ptr addrspace(1) %in, ptr addrspace(5) %out) {
537 %val = load i32, ptr addrspace(1) %in, align 4
538 store i32 %val, ptr addrspace(5) %out, !nontemporal !0
; Same as the nontemporal private store test with a direct pointer, except that
; the destination is %out indexed (getelementptr inbounds i32) by the value
; returned from @llvm.amdgcn.workitem.id.x, so the addrspace(5) address is not
; uniform. The stored value comes from an ordinary load through addrspace(1) %in.
; Per the CHECK lines below, the store is expected to carry these cache bits:
;   GFX6, GFX7, GFX10-*, SKIP-CACHE-INV, GFX90A-*: glc slc
;   GFX940-*:                                      sc0 nt sc1
;   GFX11-*:                                       glc slc dlc
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): metadata node !0 is defined outside the visible part of the file.
542 define amdgpu_kernel void @private_nontemporal_store_1(
543 ; GFX6-LABEL: private_nontemporal_store_1:
544 ; GFX6: ; %bb.0: ; %entry
545 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
546 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
547 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
548 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
549 ; GFX6-NEXT: s_add_u32 s8, s8, s7
550 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
551 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
552 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
553 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
554 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
555 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
556 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
557 ; GFX6-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
558 ; GFX6-NEXT: s_endpgm
560 ; GFX7-LABEL: private_nontemporal_store_1:
561 ; GFX7: ; %bb.0: ; %entry
562 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
563 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
564 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
565 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
566 ; GFX7-NEXT: s_add_u32 s8, s8, s7
567 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
568 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
569 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
570 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
571 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
572 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
573 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
574 ; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
575 ; GFX7-NEXT: s_endpgm
577 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
578 ; GFX10-WGP: ; %bb.0: ; %entry
579 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
580 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
581 ; GFX10-WGP-NEXT: s_clause 0x1
582 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
583 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
584 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
585 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
586 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
587 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
588 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
589 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
590 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
591 ; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
592 ; GFX10-WGP-NEXT: s_endpgm
594 ; GFX10-CU-LABEL: private_nontemporal_store_1:
595 ; GFX10-CU: ; %bb.0: ; %entry
596 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
597 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
598 ; GFX10-CU-NEXT: s_clause 0x1
599 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
600 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
601 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
602 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
603 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
604 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
605 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
606 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
608 ; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
609 ; GFX10-CU-NEXT: s_endpgm
611 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
612 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
613 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
614 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
615 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
616 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
617 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
618 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
619 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
620 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
621 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
622 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
623 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
624 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
625 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
626 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
627 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
628 ; SKIP-CACHE-INV-NEXT: s_endpgm
630 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
631 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
632 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
633 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
634 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
635 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
636 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
637 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
638 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
640 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
641 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
643 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
644 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
646 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
647 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
648 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
649 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
650 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
651 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
652 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
653 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
654 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
656 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
657 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
658 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
659 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
660 ; GFX90A-TGSPLIT-NEXT: s_endpgm
662 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
663 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
664 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
665 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
666 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
668 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
669 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
671 ; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off sc0 nt sc1
672 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
674 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
675 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
676 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
677 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
678 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
680 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
681 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
683 ; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off sc0 nt sc1
684 ; GFX940-TGSPLIT-NEXT: s_endpgm
686 ; GFX11-WGP-LABEL: private_nontemporal_store_1:
687 ; GFX11-WGP: ; %bb.0: ; %entry
688 ; GFX11-WGP-NEXT: s_clause 0x1
689 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
690 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
691 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
692 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
693 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
694 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
695 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
696 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
697 ; GFX11-WGP-NEXT: s_endpgm
699 ; GFX11-CU-LABEL: private_nontemporal_store_1:
700 ; GFX11-CU: ; %bb.0: ; %entry
701 ; GFX11-CU-NEXT: s_clause 0x1
702 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
703 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
704 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
705 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
706 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
707 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
708 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
709 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
710 ; GFX11-CU-NEXT: s_endpgm
711 ptr addrspace(1) %in, ptr addrspace(5) %out) {
713 %tid = call i32 @llvm.amdgcn.workitem.id.x()
714 %val = load i32, ptr addrspace(1) %in, align 4
715 %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid
716 store i32 %val, ptr addrspace(5) %out.gep, !nontemporal !0
; Declaration of the intrinsic that @private_nontemporal_load_1 and
; @private_nontemporal_store_1 call to obtain the i32 index for their
; getelementptr into the addrspace(5) pointer.
721 declare i32 @llvm.amdgcn.workitem.id.x()