1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Volatile-load test. The visible IR reads an i32 from %in with `load volatile`
; and writes the value to %out with an ordinary store.
;
; What the autogenerated check lines below expect for the load instruction
;   GFX7 and SKIP-CACHE-INV      flat_load_dword with glc
;   GFX10-WGP and GFX10-CU       flat_load_dword with glc dlc
;   GFX11-WGP and GFX11-CU       flat_load_b32 with glc dlc
;   GFX12-WGP and GFX12-CU       flat_load_b32 with SCOPE_SYS scope
; In every sequence a wait follows the load before the result is stored
; (s_waitcnt vmcnt(0) up to GFX11; s_wait_bvhcnt, s_wait_samplecnt and
; s_wait_loadcnt on GFX12).
;
; NOTE(review) the function name says "nontemporal" while the visible IR uses a
; volatile load and shows no !nontemporal metadata - confirm the name is
; intentional before relying on it.
; NOTE(review) the GFX7 sequence stops at the flat_store_dword check with no
; s_endpgm check, unlike every other prefix in this function - confirm no check
; line was lost.
; NOTE(review) the parameter list, entry label, ret and closing brace are not
; visible in this view; %in and %out are presumably ptr kernel arguments - verify.
11 define amdgpu_kernel void @flat_nontemporal_load_0(
12 ; GFX7-LABEL: flat_nontemporal_load_0:
13 ; GFX7: ; %bb.0: ; %entry
14 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
15 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
16 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
17 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
19 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
20 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
21 ; GFX7-NEXT: s_waitcnt vmcnt(0)
22 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
23 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
24 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX7-NEXT: flat_store_dword v[0:1], v2
28 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
29 ; GFX10-WGP: ; %bb.0: ; %entry
30 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
31 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
32 ; GFX10-WGP-NEXT: s_nop 0
33 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
34 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
36 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
37 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
38 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
39 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
40 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
41 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
43 ; GFX10-WGP-NEXT: s_endpgm
45 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
46 ; GFX10-CU: ; %bb.0: ; %entry
47 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
48 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
49 ; GFX10-CU-NEXT: s_nop 0
50 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
51 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
53 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
54 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
55 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
56 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
57 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
58 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
60 ; GFX10-CU-NEXT: s_endpgm
62 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
63 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
64 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
65 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
66 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
67 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
68 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
69 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
70 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
71 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
72 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
73 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
74 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
75 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
76 ; SKIP-CACHE-INV-NEXT: s_endpgm
78 ; GFX11-WGP-LABEL: flat_nontemporal_load_0:
79 ; GFX11-WGP: ; %bb.0: ; %entry
80 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
81 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
82 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
83 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
84 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
85 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
86 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
87 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
88 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
89 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
90 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
91 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
92 ; GFX11-WGP-NEXT: s_endpgm
94 ; GFX11-CU-LABEL: flat_nontemporal_load_0:
95 ; GFX11-CU: ; %bb.0: ; %entry
96 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
97 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
98 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
99 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
101 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
102 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
103 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
104 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
105 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
106 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
108 ; GFX11-CU-NEXT: s_endpgm
110 ; GFX12-WGP-LABEL: flat_nontemporal_load_0:
111 ; GFX12-WGP: ; %bb.0: ; %entry
112 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
113 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
114 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
115 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
116 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
117 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
118 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
119 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
120 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
121 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
122 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
123 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
124 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
125 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
126 ; GFX12-WGP-NEXT: s_endpgm
128 ; GFX12-CU-LABEL: flat_nontemporal_load_0:
129 ; GFX12-CU: ; %bb.0: ; %entry
130 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
131 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
132 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
133 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
134 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
135 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
136 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
137 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
138 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
139 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
140 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
141 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
142 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
143 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
144 ; GFX12-CU-NEXT: s_endpgm
; Access under test - the volatile i32 load from %in.
147 %val = load volatile i32, ptr %in, align 4
; Ordinary (non-volatile) store of the loaded value to %out.
148 store i32 %val, ptr %out
; Volatile-load test with a per-workitem address. The visible IR takes the
; workitem id in x, indexes %in as an array of i32 with it (inbounds GEP), reads
; that element with `load volatile`, and writes the value to %out with an
; ordinary store.
;
; What the autogenerated check lines below expect
;   - the id in v0 is shifted left by 2 and added to the 64-bit base pointer
;     with an add / add-with-carry pair (v_add_i32 + v_addc_u32 on GFX7 and
;     SKIP-CACHE-INV, v_add_co_u32 + v_add_co_ci_u32 on the newer targets);
;   - the GFX11 and GFX12 sequences first AND v0 with 0x3ff before the shift;
;   - the load carries glc on GFX7 and SKIP-CACHE-INV, glc dlc on GFX10 and
;     GFX11, and SCOPE_SYS scope on GFX12, and is followed by a wait before the
;     result is stored.
;
; NOTE(review) the function name says "nontemporal" while the visible IR uses a
; volatile load and shows no !nontemporal metadata - confirm the name is
; intentional before relying on it.
; NOTE(review) the parameter list, entry label, ret and closing brace are not
; visible in this view; %in and %out are presumably ptr kernel arguments - verify.
152 define amdgpu_kernel void @flat_nontemporal_load_1(
153 ; GFX7-LABEL: flat_nontemporal_load_1:
154 ; GFX7: ; %bb.0: ; %entry
155 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
156 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
157 ; GFX7-NEXT: s_mov_b32 s6, 2
158 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
159 ; GFX7-NEXT: s_mov_b32 s6, 0
160 ; GFX7-NEXT: ; implicit-def: $sgpr6
161 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
162 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
163 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
164 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX7-NEXT: s_mov_b32 s6, s8
166 ; GFX7-NEXT: v_mov_b32_e32 v0, v1
167 ; GFX7-NEXT: s_mov_b32 s8, s9
168 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
169 ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
170 ; GFX7-NEXT: v_mov_b32_e32 v1, s8
171 ; GFX7-NEXT: v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
172 ; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
173 ; GFX7-NEXT: v_mov_b32_e32 v1, v2
174 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
175 ; GFX7-NEXT: s_waitcnt vmcnt(0)
176 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
177 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
178 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX7-NEXT: flat_store_dword v[0:1], v2
180 ; GFX7-NEXT: s_endpgm
182 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
183 ; GFX10-WGP: ; %bb.0: ; %entry
184 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
185 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
186 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2
187 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0
188 ; GFX10-WGP-NEXT: s_mov_b32 s6, 0
189 ; GFX10-WGP-NEXT: ; implicit-def: $sgpr6
190 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
191 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
192 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0
193 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX10-WGP-NEXT: s_mov_b32 s7, s8
195 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1
196 ; GFX10-WGP-NEXT: s_mov_b32 s6, s9
197 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2
198 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s7, s7, v0
199 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
200 ; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
201 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2
202 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
203 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
204 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
205 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
206 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
208 ; GFX10-WGP-NEXT: s_endpgm
210 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
211 ; GFX10-CU: ; %bb.0: ; %entry
212 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
213 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
214 ; GFX10-CU-NEXT: s_mov_b32 s6, 2
215 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0
216 ; GFX10-CU-NEXT: s_mov_b32 s6, 0
217 ; GFX10-CU-NEXT: ; implicit-def: $sgpr6
218 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
219 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
220 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0
221 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX10-CU-NEXT: s_mov_b32 s7, s8
223 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1
224 ; GFX10-CU-NEXT: s_mov_b32 s6, s9
225 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2
226 ; GFX10-CU-NEXT: v_add_co_u32 v0, s7, s7, v0
227 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
228 ; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
229 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2
230 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
231 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
232 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
233 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
234 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
235 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
236 ; GFX10-CU-NEXT: s_endpgm
238 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
239 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
240 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
241 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
242 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
243 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0
244 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0
245 ; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr2
246 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
247 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
248 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
249 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
250 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s4
251 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v1
252 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s5
253 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
254 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[2:3], s2, v0
255 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
256 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
257 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
258 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
259 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
260 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
261 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
262 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
263 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
264 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
265 ; SKIP-CACHE-INV-NEXT: s_endpgm
267 ; GFX11-WGP-LABEL: flat_nontemporal_load_1:
268 ; GFX11-WGP: ; %bb.0: ; %entry
269 ; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
270 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
271 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
272 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
273 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2
274 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
275 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0
276 ; GFX11-WGP-NEXT: ; implicit-def: $sgpr2
277 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
279 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0
280 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX11-WGP-NEXT: s_mov_b32 s3, s4
282 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1
283 ; GFX11-WGP-NEXT: s_mov_b32 s2, s5
284 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2
285 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
286 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
287 ; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
288 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2
289 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
290 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
291 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
292 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
293 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
295 ; GFX11-WGP-NEXT: s_endpgm
297 ; GFX11-CU-LABEL: flat_nontemporal_load_1:
298 ; GFX11-CU: ; %bb.0: ; %entry
299 ; GFX11-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
300 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
301 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
302 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
303 ; GFX11-CU-NEXT: s_mov_b32 s2, 2
304 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
305 ; GFX11-CU-NEXT: s_mov_b32 s2, 0
306 ; GFX11-CU-NEXT: ; implicit-def: $sgpr2
307 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
308 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
309 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0
310 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX11-CU-NEXT: s_mov_b32 s3, s4
312 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1
313 ; GFX11-CU-NEXT: s_mov_b32 s2, s5
314 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2
315 ; GFX11-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
316 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
317 ; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
318 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2
319 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
320 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
321 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
322 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
323 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
324 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
325 ; GFX11-CU-NEXT: s_endpgm
327 ; GFX12-WGP-LABEL: flat_nontemporal_load_1:
328 ; GFX12-WGP: ; %bb.0: ; %entry
329 ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
330 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
331 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
332 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
333 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2
334 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
335 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0
336 ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
337 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
338 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
339 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
340 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
341 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4
342 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
343 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5
344 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
345 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
346 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
347 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
348 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
349 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
350 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
351 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
352 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
353 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
354 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
355 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
356 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
357 ; GFX12-WGP-NEXT: s_endpgm
359 ; GFX12-CU-LABEL: flat_nontemporal_load_1:
360 ; GFX12-CU: ; %bb.0: ; %entry
361 ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
362 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
363 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
364 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
365 ; GFX12-CU-NEXT: s_mov_b32 s2, 2
366 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
367 ; GFX12-CU-NEXT: s_mov_b32 s2, 0
368 ; GFX12-CU-NEXT: ; implicit-def: $sgpr2
369 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
370 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
371 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
372 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
373 ; GFX12-CU-NEXT: s_mov_b32 s3, s4
374 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
375 ; GFX12-CU-NEXT: s_mov_b32 s2, s5
376 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
377 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
378 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
379 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
380 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
381 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
382 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
383 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
384 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
385 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
386 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
387 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
388 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
389 ; GFX12-CU-NEXT: s_endpgm
; Per-workitem index - the workitem id in the x dimension.
392 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Address of element %tid of %in viewed as an i32 array.
393 %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
; Access under test - the volatile i32 load from the computed address.
394 %val = load volatile i32, ptr %val.gep, align 4
; Ordinary (non-volatile) store of the loaded value to %out.
395 store i32 %val, ptr %out
; Volatile-store test. The visible IR reads an i32 from %in with an ordinary
; load and writes it to %out with `store volatile`.
;
; What the autogenerated check lines below expect
;   - the load carries no cache modifier on any prefix;
;   - the store is a bare flat_store_dword on GFX7, SKIP-CACHE-INV and GFX10,
;     flat_store_b32 with dlc on GFX11, and flat_store_b32 with SCOPE_SYS scope
;     on GFX12;
;   - a wait sits between the store and s_endpgm (s_waitcnt vmcnt(0) on GFX7
;     and SKIP-CACHE-INV, s_waitcnt_vscnt null, 0x0 on GFX10 and GFX11,
;     s_wait_storecnt 0x0 on GFX12);
;   - GFX12 additionally expects a run of s_wait_* instructions immediately
;     before the store.
;
; NOTE(review) the function name says "nontemporal" while the visible IR uses a
; volatile store and shows no !nontemporal metadata - confirm the name is
; intentional before relying on it.
; NOTE(review) the parameter list, entry label, ret and closing brace are not
; visible in this view; %in and %out are presumably ptr kernel arguments - verify.
399 define amdgpu_kernel void @flat_nontemporal_store_0(
400 ; GFX7-LABEL: flat_nontemporal_store_0:
401 ; GFX7: ; %bb.0: ; %entry
402 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
403 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
404 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
405 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
406 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
407 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
408 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
409 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
410 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
411 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
412 ; GFX7-NEXT: flat_store_dword v[0:1], v2
413 ; GFX7-NEXT: s_waitcnt vmcnt(0)
414 ; GFX7-NEXT: s_endpgm
416 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
417 ; GFX10-WGP: ; %bb.0: ; %entry
418 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
419 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
420 ; GFX10-WGP-NEXT: s_nop 0
421 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
422 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
424 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
425 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
426 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
427 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
428 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
429 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
430 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
431 ; GFX10-WGP-NEXT: s_endpgm
433 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
434 ; GFX10-CU: ; %bb.0: ; %entry
435 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
436 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
437 ; GFX10-CU-NEXT: s_nop 0
438 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
439 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
441 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
442 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
443 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
444 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
445 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
447 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
448 ; GFX10-CU-NEXT: s_endpgm
450 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
451 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
452 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
453 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
454 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
455 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
456 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
457 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
458 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
459 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
460 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
461 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
462 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
463 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
464 ; SKIP-CACHE-INV-NEXT: s_endpgm
466 ; GFX11-WGP-LABEL: flat_nontemporal_store_0:
467 ; GFX11-WGP: ; %bb.0: ; %entry
468 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
469 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
470 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
471 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
473 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
474 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
475 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
476 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
477 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
478 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
479 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
480 ; GFX11-WGP-NEXT: s_endpgm
482 ; GFX11-CU-LABEL: flat_nontemporal_store_0:
483 ; GFX11-CU: ; %bb.0: ; %entry
484 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
485 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
486 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
487 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
489 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
490 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
491 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
492 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
493 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
494 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
495 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
496 ; GFX11-CU-NEXT: s_endpgm
498 ; GFX12-WGP-LABEL: flat_nontemporal_store_0:
499 ; GFX12-WGP: ; %bb.0: ; %entry
500 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
501 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
502 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
503 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
504 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
505 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
506 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
507 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
508 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
509 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
510 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
511 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
512 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
513 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
514 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
515 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
516 ; GFX12-WGP-NEXT: s_endpgm
518 ; GFX12-CU-LABEL: flat_nontemporal_store_0:
519 ; GFX12-CU: ; %bb.0: ; %entry
520 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
521 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
522 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
523 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
524 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
525 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
526 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
527 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
528 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
529 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
530 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
531 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
532 ; GFX12-CU-NEXT: s_wait_storecnt 0x0
533 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
534 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
535 ; GFX12-CU-NEXT: s_wait_storecnt 0x0
536 ; GFX12-CU-NEXT: s_endpgm
; Ordinary (non-volatile) i32 load from %in.
539 %val = load i32, ptr %in, align 4
; Access under test - the volatile store of the loaded value to %out.
540 store volatile i32 %val, ptr %out
544 define amdgpu_kernel void @flat_nontemporal_store_1(
545 ; GFX7-LABEL: flat_nontemporal_store_1:
546 ; GFX7: ; %bb.0: ; %entry
547 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
548 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
549 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
551 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
552 ; GFX7-NEXT: flat_load_dword v2, v[1:2]
553 ; GFX7-NEXT: s_mov_b32 s4, 2
554 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0
555 ; GFX7-NEXT: s_mov_b32 s4, 0
556 ; GFX7-NEXT: ; implicit-def: $sgpr4
557 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
558 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
559 ; GFX7-NEXT: v_mov_b32_e32 v4, v0
560 ; GFX7-NEXT: s_mov_b32 s4, s6
561 ; GFX7-NEXT: v_mov_b32_e32 v0, v3
562 ; GFX7-NEXT: s_mov_b32 s6, s7
563 ; GFX7-NEXT: v_mov_b32_e32 v3, v4
564 ; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0
565 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
566 ; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v1, v3, s[4:5]
567 ; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
568 ; GFX7-NEXT: v_mov_b32_e32 v1, v3
569 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
570 ; GFX7-NEXT: flat_store_dword v[0:1], v2
571 ; GFX7-NEXT: s_waitcnt vmcnt(0)
572 ; GFX7-NEXT: s_endpgm
574 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
575 ; GFX10-WGP: ; %bb.0: ; %entry
576 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
577 ; GFX10-WGP-NEXT: s_nop 0
578 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
579 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
581 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5
582 ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
583 ; GFX10-WGP-NEXT: s_mov_b32 s4, 2
584 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0
585 ; GFX10-WGP-NEXT: s_mov_b32 s4, 0
586 ; GFX10-WGP-NEXT: ; implicit-def: $sgpr4
587 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
588 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
589 ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0
590 ; GFX10-WGP-NEXT: s_mov_b32 s5, s6
591 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v3
592 ; GFX10-WGP-NEXT: s_mov_b32 s4, s7
593 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v4
594 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s5, s5, v0
595 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
596 ; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
597 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
598 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
599 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
600 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
601 ; GFX10-WGP-NEXT: s_endpgm
603 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
604 ; GFX10-CU: ; %bb.0: ; %entry
605 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
606 ; GFX10-CU-NEXT: s_nop 0
607 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
608 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
609 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
610 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
611 ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
612 ; GFX10-CU-NEXT: s_mov_b32 s4, 2
613 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0
614 ; GFX10-CU-NEXT: s_mov_b32 s4, 0
615 ; GFX10-CU-NEXT: ; implicit-def: $sgpr4
616 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
617 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
618 ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0
619 ; GFX10-CU-NEXT: s_mov_b32 s5, s6
620 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v3
621 ; GFX10-CU-NEXT: s_mov_b32 s4, s7
622 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v4
623 ; GFX10-CU-NEXT: v_add_co_u32 v0, s5, s5, v0
624 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
625 ; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
626 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
627 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
628 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
629 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
630 ; GFX10-CU-NEXT: s_endpgm
632 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
633 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
634 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
635 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
636 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
637 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
638 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
639 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
640 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2
641 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0
642 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0
643 ; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr0
644 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
645 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
646 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0
647 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s2
648 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v3
649 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s3
650 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v4
651 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0
652 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2
653 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1]
654 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
655 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v3
656 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
657 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
658 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
659 ; SKIP-CACHE-INV-NEXT: s_endpgm
661 ; GFX11-WGP-LABEL: flat_nontemporal_store_1:
662 ; GFX11-WGP: ; %bb.0: ; %entry
663 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
664 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
665 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
667 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
668 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
669 ; GFX11-WGP-NEXT: s_mov_b32 s0, 0x3ff
670 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0
671 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2
672 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
673 ; GFX11-WGP-NEXT: s_mov_b32 s0, 0
674 ; GFX11-WGP-NEXT: ; implicit-def: $sgpr0
675 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
676 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
677 ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0
678 ; GFX11-WGP-NEXT: s_mov_b32 s1, s2
679 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v3
680 ; GFX11-WGP-NEXT: s_mov_b32 s0, s3
681 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v4
682 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
683 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
684 ; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
685 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
686 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
687 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
688 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
689 ; GFX11-WGP-NEXT: s_endpgm
691 ; GFX11-CU-LABEL: flat_nontemporal_store_1:
692 ; GFX11-CU: ; %bb.0: ; %entry
693 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
694 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
695 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
696 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
697 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
698 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
699 ; GFX11-CU-NEXT: s_mov_b32 s0, 0x3ff
700 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0
701 ; GFX11-CU-NEXT: s_mov_b32 s0, 2
702 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
703 ; GFX11-CU-NEXT: s_mov_b32 s0, 0
704 ; GFX11-CU-NEXT: ; implicit-def: $sgpr0
705 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
706 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
707 ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0
708 ; GFX11-CU-NEXT: s_mov_b32 s1, s2
709 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v3
710 ; GFX11-CU-NEXT: s_mov_b32 s0, s3
711 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v4
712 ; GFX11-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
713 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
714 ; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
715 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
716 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
717 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
718 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
719 ; GFX11-CU-NEXT: s_endpgm
721 ; GFX12-WGP-LABEL: flat_nontemporal_store_1:
722 ; GFX12-WGP: ; %bb.0: ; %entry
723 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
724 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
725 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
726 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
727 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
728 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
729 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
730 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
731 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2
732 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
733 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0
734 ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
735 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
736 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
737 ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
738 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2
739 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
740 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3
741 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
742 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
743 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
744 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
745 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
746 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
747 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
748 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
749 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
750 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
751 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
752 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
753 ; GFX12-WGP-NEXT: s_endpgm
755 ; GFX12-CU-LABEL: flat_nontemporal_store_1:
756 ; GFX12-CU: ; %bb.0: ; %entry
757 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
758 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
759 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
760 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
761 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
762 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
763 ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
764 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
765 ; GFX12-CU-NEXT: s_mov_b32 s0, 2
766 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
767 ; GFX12-CU-NEXT: s_mov_b32 s0, 0
768 ; GFX12-CU-NEXT: ; implicit-def: $sgpr0
769 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
770 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
771 ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
772 ; GFX12-CU-NEXT: s_mov_b32 s1, s2
773 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
774 ; GFX12-CU-NEXT: s_mov_b32 s0, s3
775 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
776 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
777 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
778 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
779 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
780 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
781 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
782 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
783 ; GFX12-CU-NEXT: s_wait_storecnt 0x0
784 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
785 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
786 ; GFX12-CU-NEXT: s_wait_storecnt 0x0
787 ; GFX12-CU-NEXT: s_endpgm
790 %tid = call i32 @llvm.amdgcn.workitem.id.x()
791 %val = load i32, ptr %in, align 4
792 %out.gep = getelementptr inbounds i32, ptr %out, i32 %tid
793 store volatile i32 %val, ptr %out.gep
; Volatile atomic load with workgroup scope and acquire ordering; the loaded
; value is then written to %out with an ordinary (non-atomic) store.
; Summary of the expected code below, per RUN configuration (all at -O0):
;   * gfx700 (both the amdhsa run and the amdpal skip-cache-invalidations run)
;     and every +cumode run expect a plain flat load, a wait on the lgkm
;     counter (ds counter on gfx1200) right after it, and a wait on the vm
;     counter (load counter on gfx1200) just before the store. No cache
;     invalidate is expected.
;   * gfx1010 and gfx1100 without +cumode expect the load to carry glc,
;     followed by a combined vmcnt/lgkmcnt wait and buffer_gl0_inv.
;   * gfx1200 without +cumode expects the load to carry scope SCOPE_SE,
;     followed by bvh/sample/load+ds counter waits and global_inv with the
;     same scope.
797 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
798 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
799 ; GFX7: ; %bb.0: ; %entry
800 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
801 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
802 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
803 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
804 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
805 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
806 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
807 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
808 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
809 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
810 ; GFX7-NEXT: s_waitcnt vmcnt(0)
811 ; GFX7-NEXT: flat_store_dword v[0:1], v2
812 ; GFX7-NEXT: s_endpgm
814 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
815 ; GFX10-WGP: ; %bb.0: ; %entry
816 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
817 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
818 ; GFX10-WGP-NEXT: s_nop 0
819 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
820 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
821 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
822 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
823 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
824 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
825 ; GFX10-WGP-NEXT: buffer_gl0_inv
826 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
827 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
828 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
829 ; GFX10-WGP-NEXT: s_endpgm
831 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
832 ; GFX10-CU: ; %bb.0: ; %entry
833 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
834 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
835 ; GFX10-CU-NEXT: s_nop 0
836 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
837 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
838 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
839 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
840 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
841 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
842 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
843 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
844 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
845 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
846 ; GFX10-CU-NEXT: s_endpgm
848 ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load:
849 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
850 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
851 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
852 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
853 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
854 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
855 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
856 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
857 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
858 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
859 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
860 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
861 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
862 ; SKIP-CACHE-INV-NEXT: s_endpgm
864 ; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load:
865 ; GFX11-WGP: ; %bb.0: ; %entry
866 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
867 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
868 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
869 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
870 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
871 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
872 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
873 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
874 ; GFX11-WGP-NEXT: buffer_gl0_inv
875 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
876 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
877 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
878 ; GFX11-WGP-NEXT: s_endpgm
880 ; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
881 ; GFX11-CU: ; %bb.0: ; %entry
882 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
883 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
884 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
885 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
886 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
887 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
888 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
889 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
890 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
891 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
892 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
893 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
894 ; GFX11-CU-NEXT: s_endpgm
896 ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load:
897 ; GFX12-WGP: ; %bb.0: ; %entry
898 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
899 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
900 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
901 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
902 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
903 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
904 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
905 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
906 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
907 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
908 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
909 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
910 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
911 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
912 ; GFX12-WGP-NEXT: s_endpgm
914 ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load:
915 ; GFX12-CU: ; %bb.0: ; %entry
916 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
917 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
918 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
919 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
920 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
921 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
922 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
923 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
924 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
925 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
926 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
927 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
928 ; GFX12-CU-NEXT: s_endpgm
; IR under test: the atomic volatile acquire load, then a plain store of the
; loaded value.
931 %val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4
932 store i32 %val, ptr %out
; Volatile atomic store with workgroup scope and release ordering; the i32
; kernel argument %in is stored straight to %out.
; Summary of the expected code below, per RUN configuration (all at -O0):
;   * gfx700 (both the amdhsa run and the amdpal skip-cache-invalidations run)
;     and every +cumode run expect only a wait on the lgkm counter (ds counter
;     on gfx1200) before a plain flat store.
;   * gfx1010 and gfx1100 without +cumode expect a combined vmcnt/lgkmcnt wait
;     plus an s_waitcnt_vscnt before the store.
;   * gfx1200 without +cumode expects global_wb with scope SCOPE_SE, then
;     bvh/sample/store/load+ds counter waits, and the store itself carrying
;     scope SCOPE_SE.
936 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
937 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
938 ; GFX7: ; %bb.0: ; %entry
939 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
940 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
941 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
942 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
943 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
944 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
945 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
946 ; GFX7-NEXT: flat_store_dword v[0:1], v2
947 ; GFX7-NEXT: s_endpgm
949 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
950 ; GFX10-WGP: ; %bb.0: ; %entry
951 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
952 ; GFX10-WGP-NEXT: s_nop 0
953 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
954 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
955 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
956 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
957 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
958 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
960 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
961 ; GFX10-WGP-NEXT: s_endpgm
963 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
964 ; GFX10-CU: ; %bb.0: ; %entry
965 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
966 ; GFX10-CU-NEXT: s_nop 0
967 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
968 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
969 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
970 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
971 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
972 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
973 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
974 ; GFX10-CU-NEXT: s_endpgm
976 ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store:
977 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
978 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
979 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
980 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
981 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
982 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
983 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
984 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
985 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
986 ; SKIP-CACHE-INV-NEXT: s_endpgm
988 ; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store:
989 ; GFX11-WGP: ; %bb.0: ; %entry
990 ; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
991 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
992 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
993 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
994 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
995 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
996 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
997 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
998 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
999 ; GFX11-WGP-NEXT: s_endpgm
1001 ; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
1002 ; GFX11-CU: ; %bb.0: ; %entry
1003 ; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
1004 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
1005 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1006 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1007 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1008 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1009 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1010 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
1011 ; GFX11-CU-NEXT: s_endpgm
1013 ; GFX12-WGP-LABEL: flat_volatile_workgroup_release_store:
1014 ; GFX12-WGP: ; %bb.0: ; %entry
1015 ; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
1016 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
1017 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1018 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1019 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1020 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1021 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
1022 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1023 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1024 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
1025 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1026 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
1027 ; GFX12-WGP-NEXT: s_endpgm
1029 ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store:
1030 ; GFX12-CU: ; %bb.0: ; %entry
1031 ; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
1032 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
1033 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1034 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1035 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1036 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1037 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1038 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
1039 ; GFX12-CU-NEXT: s_endpgm
1040 i32 %in, ptr %out) {
; IR under test: the atomic volatile release store of the scalar argument.
1042 store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
1046 declare i32 @llvm.amdgcn.workitem.id.x()