; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Nontemporal i32 load through the kernel-argument pointer %in, followed by an
; ordinary store to %out. On every run line the load is checked as a scalar
; s_load_dword with no cache-policy modifier, and the store carries no
; nontemporal modifier (the gfx940 store is checked with "sc0 sc1").
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s12, s5
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
; GFX6-NEXT:    s_mov_b32 s11, -1
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT:    s_mov_b32 s5, s12
; GFX6-NEXT:    s_mov_b32 s6, s11
; GFX6-NEXT:    s_mov_b32 s7, s10
; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s8
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_nop 0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT:    s_nop 0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_nop 0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_nop 0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_load_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: global_nontemporal_load_0:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: global_nontemporal_load_0:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
  %val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}
; Nontemporal i32 load from %in indexed by the workitem id, followed by an
; ordinary store to %out. The checks pin the cache-policy bits on the vector
; load per target: "glc slc" (gfx600, gfx700, gfx90a), "slc" (gfx1010),
; "nt" (gfx940), "slc dlc" (gfx1100) and "th:TH_LOAD_NT" (gfx1200).
define amdgpu_kernel void @global_nontemporal_load_1(
; GFX6-LABEL: global_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s12, s5
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
; GFX6-NEXT:    s_mov_b32 s11, -1
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT:    s_mov_b32 s5, s12
; GFX6-NEXT:    s_mov_b32 s6, s11
; GFX6-NEXT:    s_mov_b32 s7, s10
; GFX6-NEXT:    s_mov_b32 s12, 0
; GFX6-NEXT:    ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13
; GFX6-NEXT:    s_mov_b32 s13, s10
; GFX6-NEXT:    ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT:    s_mov_b64 s[10:11], s[12:13]
; GFX6-NEXT:    s_mov_b32 s12, 2
; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s12, v0
; GFX6-NEXT:    s_mov_b32 s12, 0
; GFX6-NEXT:    ; implicit-def: $sgpr12
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT:    v_mov_b32_e32 v1, v2
; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc slc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
; GFX7-NEXT:    s_mov_b32 s6, 2
; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    ; implicit-def: $sgpr6
; GFX7-NEXT:    v_mov_b32_e32 v0, 0
; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, v1
; GFX7-NEXT:    s_mov_b32 s8, s9
; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s8
; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX7-NEXT:    v_mov_b32_e32 v1, v2
; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_nop 0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_mov_b32 s8, 2
; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_load_dword v1, v1, s[6:7] slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT:    s_nop 0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_mov_b32 s8, 2
; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_load_dword v1, v1, s[6:7] slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, s6
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], s[8:9]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 2
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s8, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr8
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, 0
; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s8, 0x3ff
; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s8
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s8, 2
; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v1, s[6:7] glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_nop 0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s8, 0x3ff
; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s8
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s8, 2
; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v1, s[6:7] glc slc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 2
; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v1, s[2:3] nt
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_nop 0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 2
; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_load_1:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT:    s_mov_b32 s4, 0x3ff
; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX11-WGP-NEXT:    s_mov_b32 s4, 2
; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] slc dlc
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_1:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT:    s_mov_b32 s4, 0x3ff
; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX11-CU-NEXT:    s_mov_b32 s4, 2
; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    global_load_b32 v1, v1, s[2:3] slc dlc
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: global_nontemporal_load_1:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT:    s_mov_b32 s4, 0x3ff
; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX12-WGP-NEXT:    s_mov_b32 s4, 2
; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: global_nontemporal_load_1:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT:    s_mov_b32 s4, 0x3ff
; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s4
; GFX12-CU-NEXT:    s_mov_b32 s4, 2
; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %val.gep, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}
; Ordinary i32 load from %in followed by a nontemporal store to %out. The
; checks pin the cache-policy bits on the store per target: "glc slc"
; (gfx600, gfx700, gfx1010, gfx90a), "sc0 nt sc1" (gfx940), "glc slc dlc"
; (gfx1100) and "th:TH_STORE_NT" (gfx1200).
define amdgpu_kernel void @global_nontemporal_store_0(
; GFX6-LABEL: global_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s12, s5
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
; GFX6-NEXT:    s_mov_b32 s11, -1
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT:    s_mov_b32 s5, s12
; GFX6-NEXT:    s_mov_b32 s6, s11
; GFX6-NEXT:    s_mov_b32 s7, s10
; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s8
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_nop 0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT:    s_nop 0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0 glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_nop 0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_nop 0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: global_nontemporal_store_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: global_nontemporal_store_0:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: global_nontemporal_store_0:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
  %val = load i32, ptr addrspace(1) %in, align 4
  store i32 %val, ptr addrspace(1) %out, !nontemporal !0
  ret void
}
; Kernel under test: a plain i32 load from %in feeding a store to
; %out[workitem.id.x] that carries !nontemporal metadata.
; The assertion lines inside the signature are autogenerated, one group per
; target configuration, and record the expected final store: "glc slc" on the
; buffer_/flat_/global_store_dword forms, "sc0 nt sc1" for the gfx940 runs,
; "glc slc dlc" for the gfx11 runs and "th:TH_STORE_NT" for the gfx12 runs.
; Regenerate them with the update script rather than editing them by hand.
673 define amdgpu_kernel void @global_nontemporal_store_1(
674 ; GFX6-LABEL: global_nontemporal_store_1:
675 ; GFX6: ; %bb.0: ; %entry
676 ; GFX6-NEXT: s_mov_b64 s[4:5], s[6:7]
677 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
678 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
679 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0
681 ; GFX6-NEXT: s_mov_b32 s6, 0x100f000
682 ; GFX6-NEXT: s_mov_b32 s10, 0
683 ; GFX6-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
684 ; GFX6-NEXT: s_mov_b32 s11, s6
685 ; GFX6-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
686 ; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11]
687 ; GFX6-NEXT: s_mov_b32 s9, 2
688 ; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0
689 ; GFX6-NEXT: s_mov_b32 s9, 0
690 ; GFX6-NEXT: ; implicit-def: $sgpr9
691 ; GFX6-NEXT: v_mov_b32_e32 v0, 0
692 ; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
693 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
694 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
695 ; GFX6-NEXT: v_mov_b32_e32 v0, s8
696 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 glc slc
697 ; GFX6-NEXT: s_endpgm
699 ; GFX7-LABEL: global_nontemporal_store_1:
700 ; GFX7: ; %bb.0: ; %entry
701 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
702 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2
703 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
704 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
705 ; GFX7-NEXT: s_mov_b32 s5, 2
706 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0
707 ; GFX7-NEXT: s_mov_b32 s5, 0
708 ; GFX7-NEXT: ; implicit-def: $sgpr5
709 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
710 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
711 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
712 ; GFX7-NEXT: s_mov_b32 s6, s8
713 ; GFX7-NEXT: v_mov_b32_e32 v0, v1
714 ; GFX7-NEXT: s_mov_b32 s5, s9
715 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
716 ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
717 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
718 ; GFX7-NEXT: v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
719 ; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
720 ; GFX7-NEXT: v_mov_b32_e32 v1, v2
721 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
723 ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
724 ; GFX7-NEXT: s_endpgm
726 ; GFX10-WGP-LABEL: global_nontemporal_store_1:
727 ; GFX10-WGP: ; %bb.0: ; %entry
728 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
729 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
730 ; GFX10-WGP-NEXT: s_nop 0
731 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
732 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
733 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0
734 ; GFX10-WGP-NEXT: s_mov_b32 s7, 2
735 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v0, s7, v0
736 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
737 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
738 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] glc slc
739 ; GFX10-WGP-NEXT: s_endpgm
741 ; GFX10-CU-LABEL: global_nontemporal_store_1:
742 ; GFX10-CU: ; %bb.0: ; %entry
743 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
744 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
745 ; GFX10-CU-NEXT: s_nop 0
746 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
747 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0
749 ; GFX10-CU-NEXT: s_mov_b32 s7, 2
750 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v0, s7, v0
751 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
752 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
753 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] glc slc
754 ; GFX10-CU-NEXT: s_endpgm
756 ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
757 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
758 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
759 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
760 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
761 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
762 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0
763 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0xf000
764 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
765 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
766 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s2
767 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
768 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7]
769 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
770 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0
771 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0
772 ; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr5
773 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
774 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
775 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
776 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
777 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
778 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 glc slc
779 ; SKIP-CACHE-INV-NEXT: s_endpgm
781 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
782 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
783 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
784 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
785 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
786 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
787 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0
789 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
790 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s7
791 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 2
792 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s7, v0
793 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
794 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
795 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] glc slc
796 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
798 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1:
799 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
800 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
801 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
802 ; GFX90A-TGSPLIT-NEXT: s_nop 0
803 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
804 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
805 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0
806 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
807 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s7
808 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 2
809 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s7, v0
810 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
811 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
812 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] glc slc
813 ; GFX90A-TGSPLIT-NEXT: s_endpgm
815 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
816 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
817 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
818 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
819 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
820 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
821 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
822 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0
823 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
824 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s3
825 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s3, 2
826 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s3, v0
827 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
828 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
829 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 nt sc1
830 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
832 ; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1:
833 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
834 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
835 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
836 ; GFX940-TGSPLIT-NEXT: s_nop 0
837 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
838 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
839 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0
840 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
841 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s3
842 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s3, 2
843 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s3, v0
844 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
845 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
846 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 nt sc1
847 ; GFX940-TGSPLIT-NEXT: s_endpgm
849 ; GFX11-WGP-LABEL: global_nontemporal_store_1:
850 ; GFX11-WGP: ; %bb.0: ; %entry
851 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
852 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
853 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
854 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
855 ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
856 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
857 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s3
858 ; GFX11-WGP-NEXT: s_mov_b32 s3, 2
859 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
860 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
861 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
862 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] glc slc dlc
863 ; GFX11-WGP-NEXT: s_endpgm
865 ; GFX11-CU-LABEL: global_nontemporal_store_1:
866 ; GFX11-CU: ; %bb.0: ; %entry
867 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
868 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
869 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
870 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
871 ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
872 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
873 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s3
874 ; GFX11-CU-NEXT: s_mov_b32 s3, 2
875 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
876 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
877 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
878 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] glc slc dlc
879 ; GFX11-CU-NEXT: s_endpgm
881 ; GFX12-WGP-LABEL: global_nontemporal_store_1:
882 ; GFX12-WGP: ; %bb.0: ; %entry
883 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
884 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
885 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
886 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
887 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
888 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
889 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3
890 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2
891 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
892 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
893 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
894 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
895 ; GFX12-WGP-NEXT: s_endpgm
897 ; GFX12-CU-LABEL: global_nontemporal_store_1:
898 ; GFX12-CU: ; %bb.0: ; %entry
899 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
900 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
901 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
902 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
903 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
904 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
905 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3
906 ; GFX12-CU-NEXT: s_mov_b32 s3, 2
907 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
908 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
909 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
910 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
911 ; GFX12-CU-NEXT: s_endpgm
912 ptr addrspace(1) %in, ptr addrspace(1) %out) {
; Work-item id in x; used below as the i32 element index into %out.
914 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary load: it carries neither a volatile nor a !nontemporal marker.
915 %val = load i32, ptr addrspace(1) %in, align 4
916 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
; The store is the only access in this kernel tagged !nontemporal.
917 store i32 %val, ptr addrspace(1) %out.gep, !nontemporal !0
; Kernel under test: an i32 load from %in that is both volatile and tagged
; !nontemporal, followed by an untagged store of the loaded value to %out.
; The autogenerated assertion lines inside the signature record, for each
; target configuration, the expected load operands ("glc", "glc dlc",
; "sc0 sc1", or "th:TH_LOAD_NT scope:SCOPE_SYS") and the wait placed between
; the load and the store (s_waitcnt vmcnt(0), or s_wait_bvhcnt +
; s_wait_samplecnt + s_wait_loadcnt on the gfx12 runs). None of the pre-gfx12
; expectations show an slc or nt operand on the load.
; Regenerate them with the update script rather than editing them by hand.
921 define amdgpu_kernel void @global_nontemporal_volatile_load(
922 ; GFX6-LABEL: global_nontemporal_volatile_load:
923 ; GFX6: ; %bb.0: ; %entry
924 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
925 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
926 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
927 ; GFX6-NEXT: s_mov_b32 s6, s9
928 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
929 ; GFX6-NEXT: s_mov_b32 s12, 0x100f000
930 ; GFX6-NEXT: s_mov_b32 s13, -1
931 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
932 ; GFX6-NEXT: s_mov_b32 s9, s6
933 ; GFX6-NEXT: s_mov_b32 s10, s13
934 ; GFX6-NEXT: s_mov_b32 s11, s12
935 ; GFX6-NEXT: s_mov_b32 s14, s5
936 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
937 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
938 ; GFX6-NEXT: s_mov_b32 s5, s14
939 ; GFX6-NEXT: s_mov_b32 s6, s13
940 ; GFX6-NEXT: s_mov_b32 s7, s12
941 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
942 ; GFX6-NEXT: s_waitcnt vmcnt(0)
943 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
944 ; GFX6-NEXT: s_endpgm
946 ; GFX7-LABEL: global_nontemporal_volatile_load:
947 ; GFX7: ; %bb.0: ; %entry
948 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
949 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
950 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
951 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
952 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
953 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
954 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
955 ; GFX7-NEXT: s_waitcnt vmcnt(0)
956 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
957 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
958 ; GFX7-NEXT: flat_store_dword v[0:1], v2
959 ; GFX7-NEXT: s_endpgm
961 ; GFX10-WGP-LABEL: global_nontemporal_volatile_load:
962 ; GFX10-WGP: ; %bb.0: ; %entry
963 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
964 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
965 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
966 ; GFX10-WGP-NEXT: s_nop 0
967 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
968 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
969 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
970 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
971 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
972 ; GFX10-WGP-NEXT: s_endpgm
974 ; GFX10-CU-LABEL: global_nontemporal_volatile_load:
975 ; GFX10-CU: ; %bb.0: ; %entry
976 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
977 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
978 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
979 ; GFX10-CU-NEXT: s_nop 0
980 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
981 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
982 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
983 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
984 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
985 ; GFX10-CU-NEXT: s_endpgm
987 ; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load:
988 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
989 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
990 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
991 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
992 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
993 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
994 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
995 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
996 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
997 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
998 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
999 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
1000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
1001 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1002 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1003 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
1004 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
1005 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
1006 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
1007 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
1008 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
1009 ; SKIP-CACHE-INV-NEXT: s_endpgm
1011 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
1012 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1013 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
1014 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
1015 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1016 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
1017 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1018 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1019 ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
1020 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
1021 ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
1022 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1024 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load:
1025 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1026 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
1027 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
1028 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1029 ; GFX90A-TGSPLIT-NEXT: s_nop 0
1030 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1031 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1032 ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
1033 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1034 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
1035 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1037 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
1038 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1039 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
1040 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
1041 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1042 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
1043 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
1044 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1045 ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
1046 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
1047 ; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
1048 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1050 ; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load:
1051 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1052 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
1053 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
1054 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1055 ; GFX940-TGSPLIT-NEXT: s_nop 0
1056 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
1057 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1058 ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
1059 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
1061 ; GFX940-TGSPLIT-NEXT: s_endpgm
1063 ; GFX11-WGP-LABEL: global_nontemporal_volatile_load:
1064 ; GFX11-WGP: ; %bb.0: ; %entry
1065 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1066 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
1067 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1068 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1069 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1071 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
1072 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1073 ; GFX11-WGP-NEXT: s_endpgm
1075 ; GFX11-CU-LABEL: global_nontemporal_volatile_load:
1076 ; GFX11-CU: ; %bb.0: ; %entry
1077 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1078 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
1079 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1080 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1081 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1082 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1083 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
1084 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1085 ; GFX11-CU-NEXT: s_endpgm
1087 ; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
1088 ; GFX12-WGP: ; %bb.0: ; %entry
1089 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1090 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
1091 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1092 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1093 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1094 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
1095 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1096 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1097 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
1098 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
1099 ; GFX12-WGP-NEXT: s_endpgm
1101 ; GFX12-CU-LABEL: global_nontemporal_volatile_load:
1102 ; GFX12-CU: ; %bb.0: ; %entry
1103 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1104 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
1105 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1106 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1107 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1108 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
1109 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
1110 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
1111 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
1112 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
1113 ; GFX12-CU-NEXT: s_endpgm
1114 ptr addrspace(1) %in, ptr addrspace(1) %out) {
; The load combines the volatile qualifier with !nontemporal metadata.
1116 %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
; Plain store: it carries neither a volatile nor a !nontemporal marker.
1117 store i32 %val, ptr addrspace(1) %out
; Declaration of the intrinsic that @global_nontemporal_store_1 calls to
; obtain the i32 index used for its %out element address.
1122 declare i32 @llvm.amdgcn.workitem.id.x()