1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; private_nontemporal_load_0: loads an i32 through the addrspace(5) kernel
; argument %in with !nontemporal metadata attached, then stores the value
; through the addrspace(1) argument %out (a plain store).
;
; What the checks pin down for the nontemporal load:
;   * GFX6, GFX7, SKIP-CACHE-INV, GFX90A-NOTTGSPLIT and GFX90A-TGSPLIT expect
;     the buffer_load_dword to carry both "glc" and "slc".
;   * GFX10-WGP and GFX10-CU expect only "slc".
; Each prefix's sequence is checked through the kernel's final s_endpgm, so an
; unexpected instruction after the store is caught for every run line.
10 define amdgpu_kernel void @private_nontemporal_load_0(
11 ; GFX6-LABEL: private_nontemporal_load_0:
12 ; GFX6: ; %bb.0: ; %entry
13 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
14 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
15 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0
16 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
17 ; GFX6-NEXT: s_add_u32 s8, s8, s7
18 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
19 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
20 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
22 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
23 ; GFX6-NEXT: s_mov_b32 s2, -1
24 ; GFX6-NEXT: s_waitcnt vmcnt(0)
25 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
28 ; GFX7-LABEL: private_nontemporal_load_0:
29 ; GFX7: ; %bb.0: ; %entry
30 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
31 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
32 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
33 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
34 ; GFX7-NEXT: s_add_u32 s8, s8, s7
35 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
36 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
38 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
39 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
40 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
41 ; GFX7-NEXT: s_waitcnt vmcnt(0)
42 ; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
45 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
46 ; GFX10-WGP: ; %bb.0: ; %entry
47 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
48 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
49 ; GFX10-WGP-NEXT: s_clause 0x1
50 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
51 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
52 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
53 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
54 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
55 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
57 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
58 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
59 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
60 ; GFX10-WGP-NEXT: s_endpgm
62 ; GFX10-CU-LABEL: private_nontemporal_load_0:
63 ; GFX10-CU: ; %bb.0: ; %entry
64 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
65 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
66 ; GFX10-CU-NEXT: s_clause 0x1
67 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
68 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
69 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
70 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
71 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
72 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
74 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
75 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
76 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
77 ; GFX10-CU-NEXT: s_endpgm
79 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
80 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
81 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
82 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
83 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
84 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
85 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
86 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
87 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
88 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
89 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
90 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
91 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
92 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
93 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
94 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
95 ; SKIP-CACHE-INV-NEXT: s_endpgm
97 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
98 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
99 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
100 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
101 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
102 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
103 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
104 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
105 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
106 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
108 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc
109 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
110 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
111 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
113 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
114 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
115 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
116 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
117 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
118 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
119 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
120 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
121 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
122 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
124 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc
125 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
126 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
127 ; GFX90A-TGSPLIT-NEXT: s_endpgm
130 i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
132 %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
133 store i32 %val, i32 addrspace(1)* %out
; private_nontemporal_load_1: same shape as a direct nontemporal load, but the
; address is %in (addrspace(5)) indexed by the work-item id returned by
; llvm.amdgcn.workitem.id.x. The loaded i32 is stored through the
; addrspace(1) argument %out with a plain store.
;
; What the checks pin down:
;   * Address formation: GFX6, GFX7 and SKIP-CACHE-INV expect a
;     v_lshlrev_b32 by 2 followed by v_add_i32; the GFX10 and GFX90A prefixes
;     expect a single v_lshl_add_u32.
;   * Nontemporal load: "glc slc" on buffer_load_dword for GFX6, GFX7,
;     SKIP-CACHE-INV and both GFX90A prefixes; only "slc" for GFX10-WGP and
;     GFX10-CU.
137 define amdgpu_kernel void @private_nontemporal_load_1(
138 ; GFX6-LABEL: private_nontemporal_load_1:
139 ; GFX6: ; %bb.0: ; %entry
140 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
141 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
142 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0
143 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
144 ; GFX6-NEXT: s_add_u32 s8, s8, s7
145 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
146 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
147 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0
149 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
150 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
151 ; GFX6-NEXT: s_mov_b32 s2, -1
152 ; GFX6-NEXT: s_waitcnt vmcnt(0)
153 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
154 ; GFX6-NEXT: s_endpgm
156 ; GFX7-LABEL: private_nontemporal_load_1:
157 ; GFX7: ; %bb.0: ; %entry
158 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
159 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
160 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
161 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
162 ; GFX7-NEXT: s_add_u32 s8, s8, s7
163 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
164 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
165 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
167 ; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
168 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
169 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
170 ; GFX7-NEXT: s_waitcnt vmcnt(0)
171 ; GFX7-NEXT: flat_store_dword v[0:1], v2
172 ; GFX7-NEXT: s_endpgm
174 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
175 ; GFX10-WGP: ; %bb.0: ; %entry
176 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
177 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
178 ; GFX10-WGP-NEXT: s_clause 0x1
179 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
180 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
181 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
182 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
183 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
184 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
186 ; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
187 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
188 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
189 ; GFX10-WGP-NEXT: s_endpgm
191 ; GFX10-CU-LABEL: private_nontemporal_load_1:
192 ; GFX10-CU: ; %bb.0: ; %entry
193 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
194 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
195 ; GFX10-CU-NEXT: s_clause 0x1
196 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
197 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
198 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
199 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
200 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
201 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
202 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
203 ; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen slc
204 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
206 ; GFX10-CU-NEXT: s_endpgm
208 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
209 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
210 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
211 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
212 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
213 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
214 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
215 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
216 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
217 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
218 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
219 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
220 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
221 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
222 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
223 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
224 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
225 ; SKIP-CACHE-INV-NEXT: s_endpgm
227 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
228 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
229 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
230 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
231 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
232 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
233 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
234 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
235 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
236 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
238 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
239 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
240 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
241 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
243 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
244 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
245 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
246 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
247 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
248 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
249 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
250 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
251 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
252 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
254 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
255 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
256 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
257 ; GFX90A-TGSPLIT-NEXT: s_endpgm
260 i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
262 %tid = call i32 @llvm.amdgcn.workitem.id.x()
263 %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
264 %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
265 store i32 %val, i32 addrspace(1)* %out
; private_nontemporal_store_0: loads an i32 through the addrspace(1) argument
; %in with a plain load, then stores it through the addrspace(5) argument
; %out with !nontemporal metadata attached to the store.
;
; What the checks pin down:
;   * The value is fetched with s_load_dword (a scalar load) on every prefix.
;   * Nontemporal store: "glc slc" on buffer_store_dword for GFX6, GFX7,
;     SKIP-CACHE-INV and both GFX90A prefixes; only "slc" for GFX10-WGP and
;     GFX10-CU.
269 define amdgpu_kernel void @private_nontemporal_store_0(
270 ; GFX6-LABEL: private_nontemporal_store_0:
271 ; GFX6: ; %bb.0: ; %entry
272 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
273 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
274 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
275 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
276 ; GFX6-NEXT: s_add_u32 s8, s8, s7
277 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
278 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
279 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
280 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
281 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
283 ; GFX6-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
284 ; GFX6-NEXT: s_endpgm
286 ; GFX7-LABEL: private_nontemporal_store_0:
287 ; GFX7: ; %bb.0: ; %entry
288 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
289 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
290 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
291 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
292 ; GFX7-NEXT: s_add_u32 s8, s8, s7
293 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
294 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
295 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
296 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
297 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
298 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
299 ; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
300 ; GFX7-NEXT: s_endpgm
302 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
303 ; GFX10-WGP: ; %bb.0: ; %entry
304 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
305 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
306 ; GFX10-WGP-NEXT: s_clause 0x1
307 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
308 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
309 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
310 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
311 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
312 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
313 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
314 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
315 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
316 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
317 ; GFX10-WGP-NEXT: s_endpgm
319 ; GFX10-CU-LABEL: private_nontemporal_store_0:
320 ; GFX10-CU: ; %bb.0: ; %entry
321 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
322 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
323 ; GFX10-CU-NEXT: s_clause 0x1
324 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
325 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
326 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
327 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
328 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
330 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
331 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
332 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
333 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
334 ; GFX10-CU-NEXT: s_endpgm
336 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
337 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
338 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
339 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
340 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
341 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
342 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
343 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
344 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
345 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
346 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
347 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
348 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
349 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
350 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
351 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
352 ; SKIP-CACHE-INV-NEXT: s_endpgm
354 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
355 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
356 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
357 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
358 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
359 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
360 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
361 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
362 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
364 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
365 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
367 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
368 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
370 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
371 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
372 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
373 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
374 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
375 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
376 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
377 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
378 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
380 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
381 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
382 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
383 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
384 ; GFX90A-TGSPLIT-NEXT: s_endpgm
387 i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
389 %val = load i32, i32 addrspace(1)* %in, align 4
390 store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
; private_nontemporal_store_1: loads an i32 through the addrspace(1) argument
; %in with a plain load, then stores it with !nontemporal metadata to %out
; (addrspace(5)) indexed by the work-item id returned by
; llvm.amdgcn.workitem.id.x.
;
; What the checks pin down:
;   * Address formation: GFX6, GFX7 and SKIP-CACHE-INV expect a
;     v_lshlrev_b32 by 2 followed by v_add_i32; the GFX10 and GFX90A prefixes
;     expect a single v_lshl_add_u32.
;   * Nontemporal store: "glc slc" on buffer_store_dword for GFX6, GFX7,
;     SKIP-CACHE-INV and both GFX90A prefixes; only "slc" for GFX10-WGP and
;     GFX10-CU.
394 define amdgpu_kernel void @private_nontemporal_store_1(
395 ; GFX6-LABEL: private_nontemporal_store_1:
396 ; GFX6: ; %bb.0: ; %entry
397 ; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
398 ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1]
399 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
400 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
401 ; GFX6-NEXT: s_add_u32 s8, s8, s7
402 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
403 ; GFX6-NEXT: s_addc_u32 s9, s9, 0
404 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
405 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
406 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
407 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
409 ; GFX6-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
410 ; GFX6-NEXT: s_endpgm
412 ; GFX7-LABEL: private_nontemporal_store_1:
413 ; GFX7: ; %bb.0: ; %entry
414 ; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
415 ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
416 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
417 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
418 ; GFX7-NEXT: s_add_u32 s8, s8, s7
419 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
420 ; GFX7-NEXT: s_addc_u32 s9, s9, 0
421 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
422 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
423 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
424 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
426 ; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
427 ; GFX7-NEXT: s_endpgm
429 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
430 ; GFX10-WGP: ; %bb.0: ; %entry
431 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
432 ; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
433 ; GFX10-WGP-NEXT: s_clause 0x1
434 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
435 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
436 ; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
437 ; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
438 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
439 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
440 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
441 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
443 ; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
444 ; GFX10-WGP-NEXT: s_endpgm
446 ; GFX10-CU-LABEL: private_nontemporal_store_1:
447 ; GFX10-CU: ; %bb.0: ; %entry
448 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
449 ; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
450 ; GFX10-CU-NEXT: s_clause 0x1
451 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
452 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
453 ; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
454 ; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
455 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
457 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
458 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
460 ; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
461 ; GFX10-CU-NEXT: s_endpgm
463 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
464 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
465 ; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
466 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
467 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
468 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
469 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
470 ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
471 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
472 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
473 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
474 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
475 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
476 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
477 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
478 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
479 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
480 ; SKIP-CACHE-INV-NEXT: s_endpgm
482 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
483 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
484 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
485 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
486 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
487 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
488 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7
489 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0
490 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
492 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
493 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
494 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
495 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
496 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
498 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
499 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
500 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3]
501 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1]
502 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
503 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
504 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7
505 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0
506 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
507 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
508 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
509 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
511 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
512 ; GFX90A-TGSPLIT-NEXT: s_endpgm
515 i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
517 %tid = call i32 @llvm.amdgcn.workitem.id.x()
518 %val = load i32, i32 addrspace(1)* %in, align 4
519 %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
520 store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
525 declare i32 @llvm.amdgcn.workitem.id.x()