1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
13 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
14 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Test: a !nontemporal i32 load from addrspace(3) (%in) whose result is stored
; to addrspace(1) (%out), compiled for each target named in the RUN lines.
; In the expected output the load is a plain ds_read_b32 / ds_load_b32 with no
; modifier on that instruction; only the gfx940 global store carries "sc0 sc1".
; The assertion lines below are autogenerated (see the note on the first line
; of the file) - regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
16 define amdgpu_kernel void @local_nontemporal_load_0(
17 ; GFX6-LABEL: local_nontemporal_load_0:
18 ; GFX6: ; %bb.0: ; %entry
19 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0
20 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8
21 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
22 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX6-NEXT: s_mov_b32 s11, s5
24 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
25 ; GFX6-NEXT: s_mov_b32 s9, 0x100f000
26 ; GFX6-NEXT: s_mov_b32 s10, -1
27 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
28 ; GFX6-NEXT: s_mov_b32 s5, s11
29 ; GFX6-NEXT: s_mov_b32 s6, s10
30 ; GFX6-NEXT: s_mov_b32 s7, s9
31 ; GFX6-NEXT: s_mov_b32 m0, -1
32 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
33 ; GFX6-NEXT: ds_read_b32 v0, v0
34 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
38 ; GFX7-LABEL: local_nontemporal_load_0:
39 ; GFX7: ; %bb.0: ; %entry
40 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
41 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
42 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
43 ; GFX7-NEXT: s_mov_b32 m0, -1
44 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
45 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
46 ; GFX7-NEXT: ds_read_b32 v2, v0
47 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
48 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
49 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX7-NEXT: flat_store_dword v[0:1], v2
53 ; GFX10-WGP-LABEL: local_nontemporal_load_0:
54 ; GFX10-WGP: ; %bb.0: ; %entry
55 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
56 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0
57 ; GFX10-WGP-NEXT: s_nop 0
58 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
59 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
60 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
62 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1
63 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
64 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
65 ; GFX10-WGP-NEXT: s_endpgm
67 ; GFX10-CU-LABEL: local_nontemporal_load_0:
68 ; GFX10-CU: ; %bb.0: ; %entry
69 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
70 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0
71 ; GFX10-CU-NEXT: s_nop 0
72 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
73 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
74 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
76 ; GFX10-CU-NEXT: ds_read_b32 v1, v1
77 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
79 ; GFX10-CU-NEXT: s_endpgm
81 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
82 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
83 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0
84 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
85 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
86 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
87 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
88 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
89 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
90 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
91 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
92 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
93 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
94 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
95 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
96 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
97 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
98 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
99 ; SKIP-CACHE-INV-NEXT: s_endpgm
101 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
102 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
103 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
104 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0
105 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
106 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
107 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
108 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
110 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
111 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
112 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
113 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
115 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
116 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
117 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
118 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0
119 ; GFX90A-TGSPLIT-NEXT: s_nop 0
120 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
121 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
122 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
124 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
125 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
127 ; GFX90A-TGSPLIT-NEXT: s_endpgm
129 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
130 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
131 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
132 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0
133 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
134 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
135 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
136 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
137 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
138 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
139 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
141 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
143 ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
144 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
145 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
146 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0
147 ; GFX940-TGSPLIT-NEXT: s_nop 0
148 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
149 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
150 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
152 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1
153 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
155 ; GFX940-TGSPLIT-NEXT: s_endpgm
157 ; GFX11-WGP-LABEL: local_nontemporal_load_0:
158 ; GFX11-WGP: ; %bb.0: ; %entry
159 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
160 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
161 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
162 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
163 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
164 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
165 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1
166 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
168 ; GFX11-WGP-NEXT: s_endpgm
170 ; GFX11-CU-LABEL: local_nontemporal_load_0:
171 ; GFX11-CU: ; %bb.0: ; %entry
172 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
173 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
174 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
175 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
176 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
177 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
178 ; GFX11-CU-NEXT: ds_load_b32 v1, v1
179 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
181 ; GFX11-CU-NEXT: s_endpgm
183 ; GFX12-WGP-LABEL: local_nontemporal_load_0:
184 ; GFX12-WGP: ; %bb.0: ; %entry
185 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
186 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
187 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
188 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
189 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
190 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
191 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1
192 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
193 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
194 ; GFX12-WGP-NEXT: s_endpgm
196 ; GFX12-CU-LABEL: local_nontemporal_load_0:
197 ; GFX12-CU: ; %bb.0: ; %entry
198 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
199 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
200 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
201 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
202 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
203 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
204 ; GFX12-CU-NEXT: ds_load_b32 v1, v1
205 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
206 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
207 ; GFX12-CU-NEXT: s_endpgm
208 ptr addrspace(3) %in, ptr addrspace(1) %out) {
210 %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0
211 store i32 %val, ptr addrspace(1) %out
; Test: a !nontemporal i32 load from addrspace(3) where the address is %in
; indexed by the workitem id in x (getelementptr over i32 elements); the
; loaded value is stored to addrspace(1) (%out).
; In the expected output the id is shifted left by 2 and added to the base
; before the ds_read_b32 / ds_load_b32; the gfx90a, gfx940, gfx1100 and gfx1200
; outputs additionally mask the id register with 0x3ff first. The load
; instruction itself carries no modifier.
; The assertion lines below are autogenerated (see the note on the first line
; of the file) - regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
215 define amdgpu_kernel void @local_nontemporal_load_1(
216 ; GFX6-LABEL: local_nontemporal_load_1:
217 ; GFX6: ; %bb.0: ; %entry
218 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0
219 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8
220 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
221 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX6-NEXT: s_mov_b32 s11, s5
223 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
224 ; GFX6-NEXT: s_mov_b32 s9, 0x100f000
225 ; GFX6-NEXT: s_mov_b32 s10, -1
226 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
227 ; GFX6-NEXT: s_mov_b32 s5, s11
228 ; GFX6-NEXT: s_mov_b32 s6, s10
229 ; GFX6-NEXT: s_mov_b32 s7, s9
230 ; GFX6-NEXT: s_mov_b32 s9, 2
231 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s9, v0
232 ; GFX6-NEXT: v_add_i32_e64 v0, s[8:9], s8, v0
233 ; GFX6-NEXT: s_mov_b32 m0, -1
234 ; GFX6-NEXT: ds_read_b32 v0, v0
235 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
236 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
237 ; GFX6-NEXT: s_endpgm
239 ; GFX7-LABEL: local_nontemporal_load_1:
240 ; GFX7: ; %bb.0: ; %entry
241 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
242 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
243 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
244 ; GFX7-NEXT: s_mov_b32 s7, 2
245 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0
246 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
248 ; GFX7-NEXT: s_mov_b32 m0, -1
249 ; GFX7-NEXT: ds_read_b32 v2, v0
250 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
251 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
252 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX7-NEXT: flat_store_dword v[0:1], v2
254 ; GFX7-NEXT: s_endpgm
256 ; GFX10-WGP-LABEL: local_nontemporal_load_1:
257 ; GFX10-WGP: ; %bb.0: ; %entry
258 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
259 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
260 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x0
261 ; GFX10-WGP-NEXT: s_nop 0
262 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
263 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
264 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2
265 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
267 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1
268 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
269 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
270 ; GFX10-WGP-NEXT: s_endpgm
272 ; GFX10-CU-LABEL: local_nontemporal_load_1:
273 ; GFX10-CU: ; %bb.0: ; %entry
274 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
275 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
276 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x0
277 ; GFX10-CU-NEXT: s_nop 0
278 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
279 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
280 ; GFX10-CU-NEXT: s_mov_b32 s6, 2
281 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
283 ; GFX10-CU-NEXT: ds_read_b32 v1, v1
284 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
286 ; GFX10-CU-NEXT: s_endpgm
288 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
289 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
290 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0
291 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
292 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
293 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
294 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
295 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
296 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
297 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
298 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
299 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
300 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
301 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
302 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s5, v0
303 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0
304 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
305 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
306 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
307 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
308 ; SKIP-CACHE-INV-NEXT: s_endpgm
310 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
311 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
312 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
313 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
314 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0
315 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
316 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
317 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
318 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
319 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
320 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
321 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
323 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
324 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
325 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
326 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
327 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
329 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
330 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
331 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
332 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
333 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0
334 ; GFX90A-TGSPLIT-NEXT: s_nop 0
335 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
336 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
337 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
338 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
339 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
340 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
342 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
343 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
344 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
345 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
346 ; GFX90A-TGSPLIT-NEXT: s_endpgm
348 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
349 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
350 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
351 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
352 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0
353 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
354 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
355 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
356 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
357 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
358 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
359 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
361 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
362 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
363 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
365 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
367 ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
368 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
369 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
370 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
371 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0
372 ; GFX940-TGSPLIT-NEXT: s_nop 0
373 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
374 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
375 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
376 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
377 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 2
378 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
380 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
381 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1
382 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
383 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
384 ; GFX940-TGSPLIT-NEXT: s_endpgm
386 ; GFX11-WGP-LABEL: local_nontemporal_load_1:
387 ; GFX11-WGP: ; %bb.0: ; %entry
388 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
389 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
390 ; GFX11-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
391 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
392 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
393 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
394 ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
395 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2
396 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
398 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1
399 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
401 ; GFX11-WGP-NEXT: s_endpgm
403 ; GFX11-CU-LABEL: local_nontemporal_load_1:
404 ; GFX11-CU: ; %bb.0: ; %entry
405 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
406 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
407 ; GFX11-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
408 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
409 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
410 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
411 ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
412 ; GFX11-CU-NEXT: s_mov_b32 s2, 2
413 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
414 ; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
415 ; GFX11-CU-NEXT: ds_load_b32 v1, v1
416 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
417 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
418 ; GFX11-CU-NEXT: s_endpgm
420 ; GFX12-WGP-LABEL: local_nontemporal_load_1:
421 ; GFX12-WGP: ; %bb.0: ; %entry
422 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
423 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
424 ; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
425 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
426 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
427 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
428 ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
429 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2
430 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
431 ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
432 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1
433 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
434 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
435 ; GFX12-WGP-NEXT: s_endpgm
437 ; GFX12-CU-LABEL: local_nontemporal_load_1:
438 ; GFX12-CU: ; %bb.0: ; %entry
439 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
440 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
441 ; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
442 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
443 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
444 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
445 ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
446 ; GFX12-CU-NEXT: s_mov_b32 s2, 2
447 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
448 ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
449 ; GFX12-CU-NEXT: ds_load_b32 v1, v1
450 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
451 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
452 ; GFX12-CU-NEXT: s_endpgm
453 ptr addrspace(3) %in, ptr addrspace(1) %out) {
455 %tid = call i32 @llvm.amdgcn.workitem.id.x()
456 %val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
457 %val = load i32, ptr addrspace(3) %val.gep, align 4, !nontemporal !0
458 store i32 %val, ptr addrspace(1) %out
; Test: an ordinary i32 load from addrspace(1) (%in) followed by a
; !nontemporal store of that value to addrspace(3) (%out).
; In the expected output the value is fetched with scalar loads
; (s_load_dword / s_load_b32) and written with a plain ds_write_b32 /
; ds_store_b32 that carries no modifier.
; The assertion lines below are autogenerated (see the note on the first line
; of the file) - regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
462 define amdgpu_kernel void @local_nontemporal_store_0(
463 ; GFX6-LABEL: local_nontemporal_store_0:
464 ; GFX6: ; %bb.0: ; %entry
465 ; GFX6-NEXT: s_load_dword s5, s[6:7], 0x2
466 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5
467 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
468 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
470 ; GFX6-NEXT: s_mov_b32 m0, -1
471 ; GFX6-NEXT: v_mov_b32_e32 v0, s5
472 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
474 ; GFX6-NEXT: ds_write_b32 v0, v1
475 ; GFX6-NEXT: s_endpgm
477 ; GFX7-LABEL: local_nontemporal_store_0:
478 ; GFX7: ; %bb.0: ; %entry
479 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
480 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
481 ; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2
482 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
484 ; GFX7-NEXT: s_mov_b32 m0, -1
485 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
486 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
488 ; GFX7-NEXT: ds_write_b32 v0, v1
489 ; GFX7-NEXT: s_endpgm
491 ; GFX10-WGP-LABEL: local_nontemporal_store_0:
492 ; GFX10-WGP: ; %bb.0: ; %entry
493 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
494 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
495 ; GFX10-WGP-NEXT: s_nop 0
496 ; GFX10-WGP-NEXT: s_load_dword s5, s[4:5], 0x8
497 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
498 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
499 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
500 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
501 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
502 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
503 ; GFX10-WGP-NEXT: s_endpgm
505 ; GFX10-CU-LABEL: local_nontemporal_store_0:
506 ; GFX10-CU: ; %bb.0: ; %entry
507 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
508 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
509 ; GFX10-CU-NEXT: s_nop 0
510 ; GFX10-CU-NEXT: s_load_dword s5, s[4:5], 0x8
511 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
512 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
513 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
514 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
516 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
517 ; GFX10-CU-NEXT: s_endpgm
519 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
520 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
521 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
522 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
523 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2
524 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
525 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
526 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
527 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
528 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
529 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
530 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
531 ; SKIP-CACHE-INV-NEXT: s_endpgm
533 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
534 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
535 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
536 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
537 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
538 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[4:5], 0x8
539 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
541 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
542 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
544 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
545 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
547 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
548 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
549 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
550 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
551 ; GFX90A-TGSPLIT-NEXT: s_nop 0
552 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[4:5], 0x8
553 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
554 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
555 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
556 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
558 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
559 ; GFX90A-TGSPLIT-NEXT: s_endpgm
561 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
562 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
563 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
564 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
565 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
566 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[0:1], 0x8
567 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
569 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
570 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
571 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
572 ; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
573 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
575 ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
576 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
577 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
578 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
579 ; GFX940-TGSPLIT-NEXT: s_nop 0
580 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[0:1], 0x8
581 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
582 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
583 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
584 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
585 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
586 ; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
587 ; GFX940-TGSPLIT-NEXT: s_endpgm
589 ; GFX11-WGP-LABEL: local_nontemporal_store_0:
590 ; GFX11-WGP: ; %bb.0: ; %entry
591 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
592 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
593 ; GFX11-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
594 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
595 ; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
596 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
597 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
598 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
599 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1
600 ; GFX11-WGP-NEXT: s_endpgm
602 ; GFX11-CU-LABEL: local_nontemporal_store_0:
603 ; GFX11-CU: ; %bb.0: ; %entry
604 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
605 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
606 ; GFX11-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
607 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
608 ; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
609 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
610 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
611 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
612 ; GFX11-CU-NEXT: ds_store_b32 v0, v1
613 ; GFX11-CU-NEXT: s_endpgm
615 ; GFX12-WGP-LABEL: local_nontemporal_store_0:
616 ; GFX12-WGP: ; %bb.0: ; %entry
617 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
618 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
619 ; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
620 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
621 ; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
622 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
623 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
624 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
625 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1
626 ; GFX12-WGP-NEXT: s_endpgm
628 ; GFX12-CU-LABEL: local_nontemporal_store_0:
629 ; GFX12-CU: ; %bb.0: ; %entry
630 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
631 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
632 ; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
633 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
634 ; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
635 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
636 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
637 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
638 ; GFX12-CU-NEXT: ds_store_b32 v0, v1
639 ; GFX12-CU-NEXT: s_endpgm
640 ptr addrspace(1) %in, ptr addrspace(3) %out) {
642 %val = load i32, ptr addrspace(1) %in, align 4
643 store i32 %val, ptr addrspace(3) %out, !nontemporal !0
647 define amdgpu_kernel void @local_nontemporal_store_1(
648 ; GFX6-LABEL: local_nontemporal_store_1:
649 ; GFX6: ; %bb.0: ; %entry
650 ; GFX6-NEXT: s_load_dword s5, s[6:7], 0x2
651 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5
652 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
653 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
655 ; GFX6-NEXT: s_mov_b32 s6, 2
656 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s6, v0
657 ; GFX6-NEXT: v_add_i32_e64 v0, s[6:7], s5, v0
658 ; GFX6-NEXT: s_mov_b32 m0, -1
659 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
661 ; GFX6-NEXT: ds_write_b32 v0, v1
662 ; GFX6-NEXT: s_endpgm
664 ; GFX7-LABEL: local_nontemporal_store_1:
665 ; GFX7: ; %bb.0: ; %entry
666 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
667 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
668 ; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2
669 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
671 ; GFX7-NEXT: s_mov_b32 s6, 2
672 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s6, v0
673 ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s5, v0
674 ; GFX7-NEXT: s_mov_b32 m0, -1
675 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
676 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
677 ; GFX7-NEXT: ds_write_b32 v0, v1
678 ; GFX7-NEXT: s_endpgm
680 ; GFX10-WGP-LABEL: local_nontemporal_store_1:
681 ; GFX10-WGP: ; %bb.0: ; %entry
682 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
683 ; GFX10-WGP-NEXT: s_nop 0
684 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
685 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
687 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2
688 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6
689 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
691 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
692 ; GFX10-WGP-NEXT: s_endpgm
694 ; GFX10-CU-LABEL: local_nontemporal_store_1:
695 ; GFX10-CU: ; %bb.0: ; %entry
696 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
697 ; GFX10-CU-NEXT: s_nop 0
698 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
699 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
700 ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
701 ; GFX10-CU-NEXT: s_mov_b32 s5, 2
702 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6
703 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
704 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
705 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
706 ; GFX10-CU-NEXT: s_endpgm
708 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
709 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
710 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
711 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
712 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2
713 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
714 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
715 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
716 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s2, v0
717 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[2:3], s1, v0
718 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
719 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
720 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
721 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
722 ; SKIP-CACHE-INV-NEXT: s_endpgm
724 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
725 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
726 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
727 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
728 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
729 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
731 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
732 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
733 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2
734 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
735 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1
736 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
737 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
738 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
739 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
741 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
742 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
743 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
744 ; GFX90A-TGSPLIT-NEXT: s_nop 0
745 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
746 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
747 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
748 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
749 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
750 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2
751 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
752 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1
753 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
754 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
755 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
756 ; GFX90A-TGSPLIT-NEXT: s_endpgm
758 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
759 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
760 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
761 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
762 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
763 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
764 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
765 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
766 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
767 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2
768 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
769 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1
770 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
772 ; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
773 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
775 ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
776 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
777 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
778 ; GFX940-TGSPLIT-NEXT: s_nop 0
779 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
780 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
781 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
782 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
783 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
784 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 2
785 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
786 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1
787 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
789 ; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
790 ; GFX940-TGSPLIT-NEXT: s_endpgm
792 ; GFX11-WGP-LABEL: local_nontemporal_store_1:
793 ; GFX11-WGP: ; %bb.0: ; %entry
794 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
795 ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
796 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
797 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
798 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
799 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
800 ; GFX11-WGP-NEXT: s_mov_b32 s1, 2
801 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
802 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
803 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
804 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1
805 ; GFX11-WGP-NEXT: s_endpgm
807 ; GFX11-CU-LABEL: local_nontemporal_store_1:
808 ; GFX11-CU: ; %bb.0: ; %entry
809 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
810 ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
811 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
812 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
813 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
814 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
815 ; GFX11-CU-NEXT: s_mov_b32 s1, 2
816 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
817 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
818 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
819 ; GFX11-CU-NEXT: ds_store_b32 v0, v1
820 ; GFX11-CU-NEXT: s_endpgm
822 ; GFX12-WGP-LABEL: local_nontemporal_store_1:
823 ; GFX12-WGP: ; %bb.0: ; %entry
824 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
825 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
826 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
827 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
828 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
829 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
830 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2
831 ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
832 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
833 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
834 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1
835 ; GFX12-WGP-NEXT: s_endpgm
837 ; GFX12-CU-LABEL: local_nontemporal_store_1:
838 ; GFX12-CU: ; %bb.0: ; %entry
839 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
840 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
841 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
842 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
843 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
844 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
845 ; GFX12-CU-NEXT: s_mov_b32 s1, 2
846 ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
847 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
848 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
849 ; GFX12-CU-NEXT: ds_store_b32 v0, v1
850 ; GFX12-CU-NEXT: s_endpgm
851 ptr addrspace(1) %in, ptr addrspace(3) %out) {
853 %tid = call i32 @llvm.amdgcn.workitem.id.x()
854 %val = load i32, ptr addrspace(1) %in, align 4
855 %out.gep = getelementptr inbounds i32, ptr addrspace(3) %out, i32 %tid
856 store i32 %val, ptr addrspace(3) %out.gep, !nontemporal !0
; Kernel under test: a volatile, !nontemporal i32 load from the addrspace(3)
; pointer %in, whose result is written to the addrspace(1) pointer %out with
; an ordinary (non-volatile, no metadata) store.
;
; The check blocks below give the expected -O0 output for each llc invocation
; listed at the top of the file. In every variant the value is read with a DS
; instruction (ds_read_b32, or ds_load_b32 in the GFX11 and GFX12 blocks) and a
; wait (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 in the GFX12 blocks) precedes
; the store of the loaded value.
;
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
860 define amdgpu_kernel void @local_nontemporal_volatile_load(
861 ; GFX6-LABEL: local_nontemporal_volatile_load:
862 ; GFX6: ; %bb.0: ; %entry
863 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0
864 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8
865 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
866 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
867 ; GFX6-NEXT: s_mov_b32 s11, s5
868 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
869 ; GFX6-NEXT: s_mov_b32 s9, 0x100f000
870 ; GFX6-NEXT: s_mov_b32 s10, -1
871 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
872 ; GFX6-NEXT: s_mov_b32 s5, s11
873 ; GFX6-NEXT: s_mov_b32 s6, s10
874 ; GFX6-NEXT: s_mov_b32 s7, s9
875 ; GFX6-NEXT: s_mov_b32 m0, -1
876 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
877 ; GFX6-NEXT: ds_read_b32 v0, v0
878 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
879 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
880 ; GFX6-NEXT: s_endpgm
882 ; GFX7-LABEL: local_nontemporal_volatile_load:
883 ; GFX7: ; %bb.0: ; %entry
884 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
885 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
886 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
887 ; GFX7-NEXT: s_mov_b32 m0, -1
888 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
889 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
890 ; GFX7-NEXT: ds_read_b32 v2, v0
891 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
892 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
893 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
894 ; GFX7-NEXT: flat_store_dword v[0:1], v2
895 ; GFX7-NEXT: s_endpgm
897 ; GFX10-WGP-LABEL: local_nontemporal_volatile_load:
898 ; GFX10-WGP: ; %bb.0: ; %entry
899 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
900 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0
901 ; GFX10-WGP-NEXT: s_nop 0
902 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
903 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
904 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
905 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
906 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1
907 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
908 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
909 ; GFX10-WGP-NEXT: s_endpgm
911 ; GFX10-CU-LABEL: local_nontemporal_volatile_load:
912 ; GFX10-CU: ; %bb.0: ; %entry
913 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
914 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0
915 ; GFX10-CU-NEXT: s_nop 0
916 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
917 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
918 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
919 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
920 ; GFX10-CU-NEXT: ds_read_b32 v1, v1
921 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
922 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
923 ; GFX10-CU-NEXT: s_endpgm
925 ; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load:
926 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
927 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0
928 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
929 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
930 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
931 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
932 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
933 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
934 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
935 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
936 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
937 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
938 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
939 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
940 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
941 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
942 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
943 ; SKIP-CACHE-INV-NEXT: s_endpgm
945 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
946 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
947 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
948 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0
949 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
950 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
951 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
952 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
953 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
954 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
955 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
956 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
957 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
959 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load:
960 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
961 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
962 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0
963 ; GFX90A-TGSPLIT-NEXT: s_nop 0
964 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
965 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
966 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
968 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
969 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
971 ; GFX90A-TGSPLIT-NEXT: s_endpgm
973 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
974 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
975 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
976 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0
977 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
978 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
979 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
980 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
982 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
983 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
985 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
987 ; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load:
988 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
989 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
990 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0
991 ; GFX940-TGSPLIT-NEXT: s_nop 0
992 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
993 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
994 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
995 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
996 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1
997 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
998 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
999 ; GFX940-TGSPLIT-NEXT: s_endpgm
1001 ; GFX11-WGP-LABEL: local_nontemporal_volatile_load:
1002 ; GFX11-WGP: ; %bb.0: ; %entry
1003 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1004 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
1005 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1006 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
1007 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1008 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
1009 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1
1010 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1011 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1012 ; GFX11-WGP-NEXT: s_endpgm
1014 ; GFX11-CU-LABEL: local_nontemporal_volatile_load:
1015 ; GFX11-CU: ; %bb.0: ; %entry
1016 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1017 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
1018 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1019 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
1020 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1021 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
1022 ; GFX11-CU-NEXT: ds_load_b32 v1, v1
1023 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1024 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1025 ; GFX11-CU-NEXT: s_endpgm
1027 ; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
1028 ; GFX12-WGP: ; %bb.0: ; %entry
1029 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1030 ; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
1031 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1032 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
1033 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1034 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
1035 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1
1036 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
1037 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1038 ; GFX12-WGP-NEXT: s_endpgm
1040 ; GFX12-CU-LABEL: local_nontemporal_volatile_load:
1041 ; GFX12-CU: ; %bb.0: ; %entry
1042 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1043 ; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
1044 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1045 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
1046 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1047 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
1048 ; GFX12-CU-NEXT: ds_load_b32 v1, v1
1049 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1050 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1051 ; GFX12-CU-NEXT: s_endpgm
1052 ptr addrspace(3) %in, ptr addrspace(1) %out) {
; The load carries both `volatile` and !nontemporal; the store that follows
; is a plain store with neither.
1054 %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
1055 store i32 %val, ptr addrspace(1) %out
; Intrinsic declaration; the kernels in this file call it to obtain an i32
; %tid that indexes the output pointer (e.g. in local_nontemporal_store_1).
1060 declare i32 @llvm.amdgcn.workitem.id.x()