1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; Kernel under test: loads one i32 from the addrspace(3) pointer %in with
; !nontemporal metadata attached, then writes the value to the addrspace(1)
; pointer %out with an ordinary store.
; For each llc configuration exercised by this file, the check lines below list
; a plain ds_read_b32 for the load, followed by a wait on lgkmcnt(0) before the
; store instruction that consumes the loaded value.
10 define amdgpu_kernel void @local_nontemporal_load_0(
11 ; GFX6-LABEL: local_nontemporal_load_0:
12 ; GFX6: ; %bb.0: ; %entry
13 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0
14 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
15 ; GFX6-NEXT: s_mov_b32 m0, -1
16 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
17 ; GFX6-NEXT: s_mov_b32 s2, -1
18 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
20 ; GFX6-NEXT: ds_read_b32 v0, v0
21 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
22 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
25 ; GFX7-LABEL: local_nontemporal_load_0:
26 ; GFX7: ; %bb.0: ; %entry
27 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
28 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
29 ; GFX7-NEXT: s_mov_b32 m0, -1
30 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
32 ; GFX7-NEXT: ds_read_b32 v2, v0
33 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
34 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
35 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX7-NEXT: flat_store_dword v[0:1], v2
39 ; GFX10-WGP-LABEL: local_nontemporal_load_0:
40 ; GFX10-WGP: ; %bb.0: ; %entry
41 ; GFX10-WGP-NEXT: s_clause 0x1
42 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
43 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
44 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
45 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
47 ; GFX10-WGP-NEXT: ds_read_b32 v0, v0
48 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
49 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
50 ; GFX10-WGP-NEXT: s_endpgm
52 ; GFX10-CU-LABEL: local_nontemporal_load_0:
53 ; GFX10-CU: ; %bb.0: ; %entry
54 ; GFX10-CU-NEXT: s_clause 0x1
55 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
56 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
57 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
58 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
60 ; GFX10-CU-NEXT: ds_read_b32 v0, v0
61 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
62 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
63 ; GFX10-CU-NEXT: s_endpgm
65 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
66 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
67 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
68 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
69 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
70 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
71 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
72 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
73 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
74 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
75 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
76 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
77 ; SKIP-CACHE-INV-NEXT: s_endpgm
79 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
80 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
81 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
82 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
83 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
84 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
86 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
87 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
88 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
89 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
91 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
92 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
93 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
94 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
95 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
96 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
98 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
99 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
101 ; GFX90A-TGSPLIT-NEXT: s_endpgm
104 i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
; The load is the only operation tagged !nontemporal here (the !0 node it
; refers to is presumably defined elsewhere in this file - TODO confirm).
106 %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
; Ordinary (untagged) store of the loaded value to %out.
107 store i32 %val, i32 addrspace(1)* %out
; Kernel under test: same shape as a direct nontemporal addrspace(3) load, but
; the address is %in indexed by the value returned from
; @llvm.amdgcn.workitem.id.x. The loaded i32 is written to the addrspace(1)
; pointer %out with an ordinary store.
; In the check lines below the address is formed from v0 shifted left by 2 plus
; the loaded base: either v_lshlrev_b32 followed by v_add_i32, or a single
; v_lshl_add_u32, depending on the configuration. The load itself is listed as
; a plain ds_read_b32 in every configuration.
111 define amdgpu_kernel void @local_nontemporal_load_1(
112 ; GFX6-LABEL: local_nontemporal_load_1:
113 ; GFX6: ; %bb.0: ; %entry
114 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0
115 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
116 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
117 ; GFX6-NEXT: s_mov_b32 m0, -1
118 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000
119 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0
121 ; GFX6-NEXT: ds_read_b32 v0, v0
122 ; GFX6-NEXT: s_mov_b32 s2, -1
123 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
125 ; GFX6-NEXT: s_endpgm
127 ; GFX7-LABEL: local_nontemporal_load_1:
128 ; GFX7: ; %bb.0: ; %entry
129 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
130 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
131 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
132 ; GFX7-NEXT: s_mov_b32 m0, -1
133 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
135 ; GFX7-NEXT: ds_read_b32 v2, v0
136 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
138 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX7-NEXT: flat_store_dword v[0:1], v2
140 ; GFX7-NEXT: s_endpgm
142 ; GFX10-WGP-LABEL: local_nontemporal_load_1:
143 ; GFX10-WGP: ; %bb.0: ; %entry
144 ; GFX10-WGP-NEXT: s_clause 0x1
145 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
146 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
147 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
148 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
149 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
150 ; GFX10-WGP-NEXT: ds_read_b32 v0, v0
151 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
153 ; GFX10-WGP-NEXT: s_endpgm
155 ; GFX10-CU-LABEL: local_nontemporal_load_1:
156 ; GFX10-CU: ; %bb.0: ; %entry
157 ; GFX10-CU-NEXT: s_clause 0x1
158 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
159 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
160 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
161 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
163 ; GFX10-CU-NEXT: ds_read_b32 v0, v0
164 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
166 ; GFX10-CU-NEXT: s_endpgm
168 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
169 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
170 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
171 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
172 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
173 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
174 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
175 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
176 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
177 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
178 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
179 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
180 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
181 ; SKIP-CACHE-INV-NEXT: s_endpgm
183 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
184 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
185 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
186 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
187 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
188 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
189 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
190 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
191 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
192 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
193 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
195 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
196 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
197 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
198 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
199 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
200 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
202 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
203 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
204 ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
205 ; GFX90A-TGSPLIT-NEXT: s_endpgm
208 i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
; %tid is the i32 result of the workitem.id.x intrinsic; it is used as the
; element index into %in, so the effective byte offset is %tid * 4.
210 %tid = call i32 @llvm.amdgcn.workitem.id.x()
211 %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
; The load is the only operation tagged !nontemporal here (the !0 node it
; refers to is presumably defined elsewhere in this file - TODO confirm).
212 %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
; Ordinary (untagged) store of the loaded value to %out.
213 store i32 %val, i32 addrspace(1)* %out
; Kernel under test: reads one i32 from the addrspace(1) pointer %in with an
; ordinary load, then writes it to the addrspace(3) pointer %out with a store
; that carries !nontemporal metadata.
; In the check lines below, the value is fetched with s_load_dword and the
; store is listed as a plain ds_write_b32 in every configuration; the sequence
; ends directly with s_endpgm after the write.
217 define amdgpu_kernel void @local_nontemporal_store_0(
218 ; GFX6-LABEL: local_nontemporal_store_0:
219 ; GFX6: ; %bb.0: ; %entry
220 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
221 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
222 ; GFX6-NEXT: s_mov_b32 m0, -1
223 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
224 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
225 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
226 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
228 ; GFX6-NEXT: ds_write_b32 v0, v1
229 ; GFX6-NEXT: s_endpgm
231 ; GFX7-LABEL: local_nontemporal_store_0:
232 ; GFX7: ; %bb.0: ; %entry
233 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
234 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
235 ; GFX7-NEXT: s_mov_b32 m0, -1
236 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
238 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
239 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
241 ; GFX7-NEXT: ds_write_b32 v0, v1
242 ; GFX7-NEXT: s_endpgm
244 ; GFX10-WGP-LABEL: local_nontemporal_store_0:
245 ; GFX10-WGP: ; %bb.0: ; %entry
246 ; GFX10-WGP-NEXT: s_clause 0x1
247 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
248 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
249 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
251 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
252 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
254 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
255 ; GFX10-WGP-NEXT: s_endpgm
257 ; GFX10-CU-LABEL: local_nontemporal_store_0:
258 ; GFX10-CU: ; %bb.0: ; %entry
259 ; GFX10-CU-NEXT: s_clause 0x1
260 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
261 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
262 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
264 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
265 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
267 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
268 ; GFX10-CU-NEXT: s_endpgm
270 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
271 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
272 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
273 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
274 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
275 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
276 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
277 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
278 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
279 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
280 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
281 ; SKIP-CACHE-INV-NEXT: s_endpgm
283 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
284 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
285 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
286 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
287 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
288 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
289 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
290 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
292 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
293 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
295 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
296 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
297 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
298 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
299 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
300 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
301 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
302 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
303 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
304 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
305 ; GFX90A-TGSPLIT-NEXT: s_endpgm
308 i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
; Ordinary (untagged) load of the value to be stored.
310 %val = load i32, i32 addrspace(1)* %in, align 4
; The store is the only operation tagged !nontemporal here (the !0 node it
; refers to is presumably defined elsewhere in this file - TODO confirm).
311 store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
; Kernel under test: reads one i32 from the addrspace(1) pointer %in with an
; ordinary load, then writes it with a !nontemporal store to the addrspace(3)
; pointer %out indexed by the value returned from @llvm.amdgcn.workitem.id.x.
; In the check lines below the store address is formed from v0 shifted left by
; 2 plus the loaded base: either v_lshlrev_b32 followed by v_add_i32, or a
; single v_lshl_add_u32, depending on the configuration. The store itself is
; listed as a plain ds_write_b32 in every configuration.
315 define amdgpu_kernel void @local_nontemporal_store_1(
316 ; GFX6-LABEL: local_nontemporal_store_1:
317 ; GFX6: ; %bb.0: ; %entry
318 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
319 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
320 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
321 ; GFX6-NEXT: s_mov_b32 m0, -1
322 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
324 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
325 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
326 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
327 ; GFX6-NEXT: ds_write_b32 v0, v1
328 ; GFX6-NEXT: s_endpgm
330 ; GFX7-LABEL: local_nontemporal_store_1:
331 ; GFX7: ; %bb.0: ; %entry
332 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
333 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
334 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
335 ; GFX7-NEXT: s_mov_b32 m0, -1
336 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
338 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
339 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
340 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
341 ; GFX7-NEXT: ds_write_b32 v0, v1
342 ; GFX7-NEXT: s_endpgm
344 ; GFX10-WGP-LABEL: local_nontemporal_store_1:
345 ; GFX10-WGP: ; %bb.0: ; %entry
346 ; GFX10-WGP-NEXT: s_clause 0x1
347 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
348 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
349 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
350 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
351 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
352 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
354 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
355 ; GFX10-WGP-NEXT: s_endpgm
357 ; GFX10-CU-LABEL: local_nontemporal_store_1:
358 ; GFX10-CU: ; %bb.0: ; %entry
359 ; GFX10-CU-NEXT: s_clause 0x1
360 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
361 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
362 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
364 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
365 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
367 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
368 ; GFX10-CU-NEXT: s_endpgm
370 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
371 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
372 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
373 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
374 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
375 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
376 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
377 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
378 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
379 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
380 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
381 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
382 ; SKIP-CACHE-INV-NEXT: s_endpgm
384 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
385 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
386 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
387 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
388 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
390 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
391 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
393 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
394 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
396 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
397 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
398 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
399 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
400 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
402 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2
403 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
404 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
405 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
406 ; GFX90A-TGSPLIT-NEXT: s_endpgm
409 i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
; %tid is the i32 result of the workitem.id.x intrinsic; it is used below as
; the element index into %out, so the effective byte offset is %tid * 4.
411 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary (untagged) load of the value to be stored.
412 %val = load i32, i32 addrspace(1)* %in, align 4
413 %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
; The store is the only operation tagged !nontemporal here (the !0 node it
; refers to is presumably defined elsewhere in this file - TODO confirm).
414 store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
; Declaration of the llvm.amdgcn.workitem.id.x intrinsic (no arguments, i32
; result). The local_nontemporal_load_1 and local_nontemporal_store_1 kernels
; call it to obtain the %tid index used in their getelementptr.
419 declare i32 @llvm.amdgcn.workitem.id.x()