; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s

; Each command above compiles this file for one subtarget / feature
; combination and matches the output against its own dedicated check prefix.

; Nontemporal i32 load from the local address space (addrspace(3)) through a
; pointer passed directly as a kernel argument; the loaded value is then
; stored to addrspace(1). The autogenerated checks below list a bare
; ds_read_b32 (ds_load_b32 for gfx1100) for every run configuration.
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_load_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT:    ds_load_b32 v0, v0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  ; The access under test: the addrspace(3) load carries !nontemporal.
  %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
  ; Plain store that makes the loaded value observable.
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Nontemporal i32 load from the local address space (addrspace(3)) where the
; address is %in indexed by the value returned from
; @llvm.amdgcn.workitem.id.x; the loaded value is then stored to addrspace(1).
; The checks show the index being scaled by 4 and added to the base
; (v_lshlrev + v_add, or a single v_lshl_add_u32) ahead of a bare
; ds_read_b32 / ds_load_b32.
define amdgpu_kernel void @local_nontemporal_load_1(
; GFX6-LABEL: local_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_load_1:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_1:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX11-CU-NEXT:    ds_load_b32 v0, v0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  ; Per-lane index, so the load address is computed in a vector register.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
  ; The access under test: the addrspace(3) load carries !nontemporal.
  %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
  ; Plain store that makes the loaded value observable.
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Nontemporal i32 store to the local address space (addrspace(3)) through a
; pointer passed directly as a kernel argument; the stored value is first
; loaded from addrspace(1). The autogenerated checks below list a bare
; ds_write_b32 (ds_store_b32 for gfx1100) for every run configuration.
define amdgpu_kernel void @local_nontemporal_store_0(
; GFX6-LABEL: local_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_store_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_store_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    ds_store_b32 v0, v1
; GFX11-CU-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  ; Plain load that supplies the value to be stored.
  %val = load i32, i32 addrspace(1)* %in, align 4
  ; The access under test: the addrspace(3) store carries !nontemporal.
  store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
  ret void
}

; Nontemporal i32 store to the local address space (addrspace(3)) where the
; address is %out indexed by the value returned from
; @llvm.amdgcn.workitem.id.x; the stored value is first loaded from
; addrspace(1). The checks show the index being scaled by 4 and added to the
; base (v_lshlrev + v_add, or a single v_lshl_add_u32) ahead of a bare
; ds_write_b32 / ds_store_b32.
define amdgpu_kernel void @local_nontemporal_store_1(
; GFX6-LABEL: local_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_store_1:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_clause 0x1
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_store_1:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_clause 0x1
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT:    ds_store_b32 v0, v1
; GFX11-CU-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  ; Per-lane index, so the store address is computed in a vector register.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Plain load that supplies the value to be stored.
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
  ; The access under test: the addrspace(3) store carries !nontemporal.
  store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
  ret void
}

; Metadata node referenced by the !nontemporal attachments in the kernels of
; this file; the attachment requires a node holding a single i32 1.
!0 = !{i32 1}

; Intrinsic called by the *_1 kernels to obtain a per-work-item index.
declare i32 @llvm.amdgcn.workitem.id.x()