1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
13 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; flat_nontemporal_load_0: loads an i32 from %in with !nontemporal metadata
; and writes it to %out with an ordinary store (no metadata on the store).
; The checks pin the cache-policy modifiers emitted on the flat load:
;   GFX7 / SKIP-CACHE-INV / GFX90A : glc slc
;   GFX10 (WGP and CU)             : slc
;   GFX940                         : nt
;   GFX11 (WGP and CU)             : slc dlc
;   GFX12 (WGP and CU)             : th:TH_LOAD_NT
; The store carries no nontemporal modifier here (GFX940 shows sc0 sc1 only).
; The tgsplit variants wait on vmcnt(0) alone before the store, whereas the
; non-tgsplit variants wait on vmcnt(0) lgkmcnt(0).
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
15 define amdgpu_kernel void @flat_nontemporal_load_0(
16 ; GFX7-LABEL: flat_nontemporal_load_0:
17 ; GFX7: ; %bb.0: ; %entry
18 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
19 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
20 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
21 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
22 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
23 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
24 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc
25 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
26 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
27 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
28 ; GFX7-NEXT: flat_store_dword v[0:1], v2
31 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
32 ; GFX10-WGP: ; %bb.0: ; %entry
33 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
34 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
35 ; GFX10-WGP-NEXT: s_nop 0
36 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
37 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
38 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
39 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
40 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
41 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
42 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
43 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
45 ; GFX10-WGP-NEXT: s_endpgm
47 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
48 ; GFX10-CU: ; %bb.0: ; %entry
49 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
50 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
51 ; GFX10-CU-NEXT: s_nop 0
52 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
53 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
55 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
56 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
57 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
58 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
59 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
60 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
61 ; GFX10-CU-NEXT: s_endpgm
63 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
64 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
65 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
66 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
67 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
68 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
69 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
70 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
71 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc
72 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
73 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
74 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
75 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
76 ; SKIP-CACHE-INV-NEXT: s_endpgm
78 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
79 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
80 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
81 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
82 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
83 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
84 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
86 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
87 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
88 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
89 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
90 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
92 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
93 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
94 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
95 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
96 ; GFX90A-TGSPLIT-NEXT: s_nop 0
97 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
98 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
99 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
100 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
101 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
102 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
103 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
104 ; GFX90A-TGSPLIT-NEXT: s_endpgm
106 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
107 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
108 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
109 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
110 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
111 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
112 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
114 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
115 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
116 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
118 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
120 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0:
121 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
122 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
123 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
124 ; GFX940-TGSPLIT-NEXT: s_nop 0
125 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
126 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
128 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
129 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
130 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
131 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
132 ; GFX940-TGSPLIT-NEXT: s_endpgm
134 ; GFX11-WGP-LABEL: flat_nontemporal_load_0:
135 ; GFX11-WGP: ; %bb.0: ; %entry
136 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
137 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
138 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
139 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
141 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
142 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
143 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
144 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
145 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
146 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
147 ; GFX11-WGP-NEXT: s_endpgm
149 ; GFX11-CU-LABEL: flat_nontemporal_load_0:
150 ; GFX11-CU: ; %bb.0: ; %entry
151 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
152 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
153 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
154 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
155 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
156 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
157 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
158 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
159 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
160 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
161 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
162 ; GFX11-CU-NEXT: s_endpgm
164 ; GFX12-WGP-LABEL: flat_nontemporal_load_0:
165 ; GFX12-WGP: ; %bb.0: ; %entry
166 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
167 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
168 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
169 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
170 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
171 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
172 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
173 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
174 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
175 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
176 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
177 ; GFX12-WGP-NEXT: s_endpgm
179 ; GFX12-CU-LABEL: flat_nontemporal_load_0:
180 ; GFX12-CU: ; %bb.0: ; %entry
181 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
182 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
183 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
184 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
185 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
186 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
187 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
188 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
189 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
190 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
191 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
192 ; GFX12-CU-NEXT: s_endpgm
195 %val = load i32, ptr %in, align 4, !nontemporal !0
196 store i32 %val, ptr %out
; flat_nontemporal_load_1: same nontemporal i32 load as the _0 variant, but
; the address is %in indexed by llvm.amdgcn.workitem.id.x (i32 GEP), so the
; checks also cover the per-lane address arithmetic: the index is shifted left
; by 2 and added to the 64-bit base loaded from the kernel arguments.
; GFX90A, GFX940, GFX11 and GFX12 first mask the work-item id with 0x3ff;
; GFX7, GFX10 and SKIP-CACHE-INV do not. GFX940 performs the add with a single
; v_lshl_add_u64, the other targets with an add / add-with-carry pair.
; Cache-policy modifiers expected on the flat load:
;   GFX7 / SKIP-CACHE-INV / GFX90A : glc slc
;   GFX10 (WGP and CU)             : slc
;   GFX940                         : nt
;   GFX11 (WGP and CU)             : slc dlc
;   GFX12 (WGP and CU)             : th:TH_LOAD_NT
; The store to %out has no !nontemporal metadata and gets no nontemporal bits.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
200 define amdgpu_kernel void @flat_nontemporal_load_1(
201 ; GFX7-LABEL: flat_nontemporal_load_1:
202 ; GFX7: ; %bb.0: ; %entry
203 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
204 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2
205 ; GFX7-NEXT: s_mov_b32 s6, 2
206 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
207 ; GFX7-NEXT: s_mov_b32 s6, 0
208 ; GFX7-NEXT: ; implicit-def: $sgpr6
209 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
210 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
211 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
212 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX7-NEXT: s_mov_b32 s6, s8
214 ; GFX7-NEXT: v_mov_b32_e32 v0, v1
215 ; GFX7-NEXT: s_mov_b32 s8, s9
216 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
217 ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
218 ; GFX7-NEXT: v_mov_b32_e32 v1, s8
219 ; GFX7-NEXT: v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
220 ; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
221 ; GFX7-NEXT: v_mov_b32_e32 v1, v2
222 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc
223 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
224 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
225 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
226 ; GFX7-NEXT: flat_store_dword v[0:1], v2
227 ; GFX7-NEXT: s_endpgm
229 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
230 ; GFX10-WGP: ; %bb.0: ; %entry
231 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
232 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
233 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2
234 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0
235 ; GFX10-WGP-NEXT: s_mov_b32 s6, 0
236 ; GFX10-WGP-NEXT: ; implicit-def: $sgpr6
237 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
238 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
239 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0
240 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX10-WGP-NEXT: s_mov_b32 s7, s8
242 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v1
243 ; GFX10-WGP-NEXT: s_mov_b32 s6, s9
244 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2
245 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s7, s7, v0
246 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
247 ; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
248 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v2
249 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
250 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
251 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
252 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
253 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
254 ; GFX10-WGP-NEXT: s_endpgm
256 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
257 ; GFX10-CU: ; %bb.0: ; %entry
258 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
259 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
260 ; GFX10-CU-NEXT: s_mov_b32 s6, 2
261 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0
262 ; GFX10-CU-NEXT: s_mov_b32 s6, 0
263 ; GFX10-CU-NEXT: ; implicit-def: $sgpr6
264 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
265 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
266 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0
267 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX10-CU-NEXT: s_mov_b32 s7, s8
269 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v1
270 ; GFX10-CU-NEXT: s_mov_b32 s6, s9
271 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2
272 ; GFX10-CU-NEXT: v_add_co_u32 v0, s7, s7, v0
273 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
274 ; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
275 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v2
276 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
277 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
278 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
279 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
280 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
281 ; GFX10-CU-NEXT: s_endpgm
283 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
284 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
285 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
286 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
287 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
288 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0
289 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0
290 ; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr2
291 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
292 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
293 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
294 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
295 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s4
296 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v1
297 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s5
298 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
299 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[2:3], s2, v0
300 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
301 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
302 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
303 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
304 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc
305 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
306 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
307 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
309 ; SKIP-CACHE-INV-NEXT: s_endpgm
311 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
312 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
313 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
314 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
315 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
316 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
317 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
318 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0
319 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0
320 ; GFX90A-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr6
321 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
322 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
323 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
324 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
325 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s8
326 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, v2
327 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, s9
328 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, v3
329 ; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e64 v0, s[6:7], s6, v0
330 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s8
331 ; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7]
332 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
333 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2
334 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
335 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
336 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
337 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
338 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
340 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
341 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
342 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
343 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8
344 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
345 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
346 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
347 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0
348 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0
349 ; GFX90A-TGSPLIT-NEXT: ; implicit-def: $sgpr6
350 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
351 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
352 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
353 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
354 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s8
355 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, v2
356 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, s9
357 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v3
358 ; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e64 v0, s[6:7], s6, v0
359 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s8
360 ; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7]
361 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
362 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
363 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
364 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
365 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
366 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
367 ; GFX90A-TGSPLIT-NEXT: s_endpgm
369 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
370 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
371 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
372 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
373 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
374 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
375 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff
376 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4
377 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2
378 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0
379 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0
380 ; GFX940-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr4
381 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
382 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
383 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2
384 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
386 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
387 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
388 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
389 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
390 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
392 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1:
393 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
394 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
395 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
396 ; GFX940-TGSPLIT-NEXT: s_nop 0
397 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
398 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff
399 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4
400 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 2
401 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0
402 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 0
403 ; GFX940-TGSPLIT-NEXT: ; implicit-def: $sgpr4
404 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
405 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
406 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
407 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
409 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
410 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
411 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
412 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
413 ; GFX940-TGSPLIT-NEXT: s_endpgm
415 ; GFX11-WGP-LABEL: flat_nontemporal_load_1:
416 ; GFX11-WGP: ; %bb.0: ; %entry
417 ; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
418 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
419 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
420 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
421 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2
422 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
423 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0
424 ; GFX11-WGP-NEXT: ; implicit-def: $sgpr2
425 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
426 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
427 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0
428 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
429 ; GFX11-WGP-NEXT: s_mov_b32 s3, s4
430 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v1
431 ; GFX11-WGP-NEXT: s_mov_b32 s2, s5
432 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2
433 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
434 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
435 ; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
436 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v2
437 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
438 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
439 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
440 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
441 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
442 ; GFX11-WGP-NEXT: s_endpgm
444 ; GFX11-CU-LABEL: flat_nontemporal_load_1:
445 ; GFX11-CU: ; %bb.0: ; %entry
446 ; GFX11-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
447 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
448 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
449 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
450 ; GFX11-CU-NEXT: s_mov_b32 s2, 2
451 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
452 ; GFX11-CU-NEXT: s_mov_b32 s2, 0
453 ; GFX11-CU-NEXT: ; implicit-def: $sgpr2
454 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
455 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
456 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0
457 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX11-CU-NEXT: s_mov_b32 s3, s4
459 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v1
460 ; GFX11-CU-NEXT: s_mov_b32 s2, s5
461 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2
462 ; GFX11-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
463 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
464 ; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
465 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v2
466 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
467 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
468 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
469 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
470 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
471 ; GFX11-CU-NEXT: s_endpgm
473 ; GFX12-WGP-LABEL: flat_nontemporal_load_1:
474 ; GFX12-WGP: ; %bb.0: ; %entry
475 ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
476 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
477 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
478 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
479 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2
480 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
481 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0
482 ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
483 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
484 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
485 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
486 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
487 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4
488 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
489 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5
490 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
491 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
492 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
493 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
494 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
495 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
496 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
497 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
498 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
499 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
500 ; GFX12-WGP-NEXT: s_endpgm
502 ; GFX12-CU-LABEL: flat_nontemporal_load_1:
503 ; GFX12-CU: ; %bb.0: ; %entry
504 ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
505 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
506 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
507 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
508 ; GFX12-CU-NEXT: s_mov_b32 s2, 2
509 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
510 ; GFX12-CU-NEXT: s_mov_b32 s2, 0
511 ; GFX12-CU-NEXT: ; implicit-def: $sgpr2
512 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
513 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
514 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
515 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
516 ; GFX12-CU-NEXT: s_mov_b32 s3, s4
517 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
518 ; GFX12-CU-NEXT: s_mov_b32 s2, s5
519 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
520 ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
521 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
522 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
523 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
524 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
525 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
526 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
527 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
528 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
529 ; GFX12-CU-NEXT: s_endpgm
532 %tid = call i32 @llvm.amdgcn.workitem.id.x()
533 %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
534 %val = load i32, ptr %val.gep, align 4, !nontemporal !0
535 store i32 %val, ptr %out
; flat_nontemporal_store_0: mirror image of flat_nontemporal_load_0. The i32
; load from %in is ordinary (no metadata) and the store to %out carries
; !nontemporal, so the checks pin the cache-policy modifiers on the flat store
; while the flat load is expected to have none:
;   GFX7 / SKIP-CACHE-INV / GFX10 / GFX90A : glc slc
;   GFX940                                 : sc0 nt sc1
;   GFX11 (WGP and CU)                     : glc slc dlc
;   GFX12 (WGP and CU)                     : th:TH_STORE_NT
; The tgsplit variants wait on vmcnt(0) alone before the store, whereas the
; non-tgsplit variants wait on vmcnt(0) lgkmcnt(0).
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
539 define amdgpu_kernel void @flat_nontemporal_store_0(
540 ; GFX7-LABEL: flat_nontemporal_store_0:
541 ; GFX7: ; %bb.0: ; %entry
542 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
543 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
544 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
545 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
546 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
547 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
548 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
549 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
550 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
551 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
552 ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
553 ; GFX7-NEXT: s_endpgm
555 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
556 ; GFX10-WGP: ; %bb.0: ; %entry
557 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
558 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
559 ; GFX10-WGP-NEXT: s_nop 0
560 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
561 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
563 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
564 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
565 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
566 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
567 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
568 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
569 ; GFX10-WGP-NEXT: s_endpgm
571 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
572 ; GFX10-CU: ; %bb.0: ; %entry
573 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
574 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
575 ; GFX10-CU-NEXT: s_nop 0
576 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
577 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
578 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
579 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
580 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
581 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
582 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
583 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
584 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
585 ; GFX10-CU-NEXT: s_endpgm
587 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
588 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
589 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
590 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
591 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
592 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
593 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
594 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
595 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
596 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
597 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
598 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
599 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
600 ; SKIP-CACHE-INV-NEXT: s_endpgm
602 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
603 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
604 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
605 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
606 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
607 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
608 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
609 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
610 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
611 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
612 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
613 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
614 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
616 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
617 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
618 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
619 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
620 ; GFX90A-TGSPLIT-NEXT: s_nop 0
621 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
622 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
623 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
624 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
625 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
626 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
627 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
628 ; GFX90A-TGSPLIT-NEXT: s_endpgm
630 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
631 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
632 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
633 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
634 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
635 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
636 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
637 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
638 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
639 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
640 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
641 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 nt sc1
642 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
644 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0:
645 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
646 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
647 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
648 ; GFX940-TGSPLIT-NEXT: s_nop 0
649 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
650 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
652 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
653 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
654 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
655 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 nt sc1
656 ; GFX940-TGSPLIT-NEXT: s_endpgm
658 ; GFX11-WGP-LABEL: flat_nontemporal_store_0:
659 ; GFX11-WGP: ; %bb.0: ; %entry
660 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
661 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
662 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
663 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
664 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
665 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
666 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
667 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
668 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
669 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
670 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
671 ; GFX11-WGP-NEXT: s_endpgm
673 ; GFX11-CU-LABEL: flat_nontemporal_store_0:
674 ; GFX11-CU: ; %bb.0: ; %entry
675 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
676 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
677 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
678 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
680 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
681 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
682 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
683 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
684 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
685 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
686 ; GFX11-CU-NEXT: s_endpgm
688 ; GFX12-WGP-LABEL: flat_nontemporal_store_0:
689 ; GFX12-WGP: ; %bb.0: ; %entry
690 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
691 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
692 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
693 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
694 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
695 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
696 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
697 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
698 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
699 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
700 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
701 ; GFX12-WGP-NEXT: s_endpgm
703 ; GFX12-CU-LABEL: flat_nontemporal_store_0:
704 ; GFX12-CU: ; %bb.0: ; %entry
705 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
706 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
707 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
708 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
709 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
710 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
711 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
712 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
713 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
714 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
715 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
716 ; GFX12-CU-NEXT: s_endpgm
719 %val = load i32, ptr %in, align 4
720 store i32 %val, ptr %out, !nontemporal !0
; Test kernel @flat_nontemporal_store_1: reads an i32 from %in with an ordinary
; load and writes it with a !nontemporal store to %out indexed by the value
; returned from @llvm.amdgcn.workitem.id.x.
;
; The autogenerated assertions below pin the -O0 llc output for every RUN
; configuration listed at the top of the file. In each of them the flat load
; carries no cache-policy modifier, while the final flat store does: "glc slc",
; "sc0 nt sc1", "glc slc dlc" or "th:TH_STORE_NT", depending on the target.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file).
724 define amdgpu_kernel void @flat_nontemporal_store_1(
725 ; GFX7-LABEL: flat_nontemporal_store_1:
726 ; GFX7: ; %bb.0: ; %entry
727 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
728 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
729 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
731 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
732 ; GFX7-NEXT: flat_load_dword v2, v[1:2]
733 ; GFX7-NEXT: s_mov_b32 s4, 2
734 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0
735 ; GFX7-NEXT: s_mov_b32 s4, 0
736 ; GFX7-NEXT: ; implicit-def: $sgpr4
737 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
738 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
739 ; GFX7-NEXT: v_mov_b32_e32 v4, v0
740 ; GFX7-NEXT: s_mov_b32 s4, s6
741 ; GFX7-NEXT: v_mov_b32_e32 v0, v3
742 ; GFX7-NEXT: s_mov_b32 s6, s7
743 ; GFX7-NEXT: v_mov_b32_e32 v3, v4
744 ; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0
745 ; GFX7-NEXT: v_mov_b32_e32 v1, s6
746 ; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v1, v3, s[4:5]
747 ; GFX7-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
748 ; GFX7-NEXT: v_mov_b32_e32 v1, v3
749 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
750 ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
751 ; GFX7-NEXT: s_endpgm
753 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
754 ; GFX10-WGP: ; %bb.0: ; %entry
755 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
756 ; GFX10-WGP-NEXT: s_nop 0
757 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
758 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
760 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5
761 ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
762 ; GFX10-WGP-NEXT: s_mov_b32 s4, 2
763 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0
764 ; GFX10-WGP-NEXT: s_mov_b32 s4, 0
765 ; GFX10-WGP-NEXT: ; implicit-def: $sgpr4
766 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
767 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
768 ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0
769 ; GFX10-WGP-NEXT: s_mov_b32 s5, s6
770 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, v3
771 ; GFX10-WGP-NEXT: s_mov_b32 s4, s7
772 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v4
773 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s5, s5, v0
774 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
775 ; GFX10-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
776 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3
777 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
778 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
779 ; GFX10-WGP-NEXT: s_endpgm
781 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
782 ; GFX10-CU: ; %bb.0: ; %entry
783 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
784 ; GFX10-CU-NEXT: s_nop 0
785 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
786 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
787 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
788 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5
789 ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
790 ; GFX10-CU-NEXT: s_mov_b32 s4, 2
791 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0
792 ; GFX10-CU-NEXT: s_mov_b32 s4, 0
793 ; GFX10-CU-NEXT: ; implicit-def: $sgpr4
794 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
795 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
796 ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0
797 ; GFX10-CU-NEXT: s_mov_b32 s5, s6
798 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, v3
799 ; GFX10-CU-NEXT: s_mov_b32 s4, s7
800 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v4
801 ; GFX10-CU-NEXT: v_add_co_u32 v0, s5, s5, v0
802 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
803 ; GFX10-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
804 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3
805 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
806 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
807 ; GFX10-CU-NEXT: s_endpgm
809 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
810 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
811 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
812 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
813 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
814 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
815 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
816 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
817 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2
818 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0
819 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0
820 ; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr0
821 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
822 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
823 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0
824 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s2
825 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, v3
826 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s3
827 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v4
828 ; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0
829 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2
830 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1]
831 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
832 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v3
833 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
834 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
835 ; SKIP-CACHE-INV-NEXT: s_endpgm
837 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
838 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
839 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
840 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
841 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
842 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
843 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
844 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
845 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff
846 ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4
847 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2
848 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0
849 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0
850 ; GFX90A-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr4
851 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
852 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
853 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v5, v0
854 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, s6
855 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, v4
856 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, s7
857 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v5
858 ; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e64 v0, s[4:5], s4, v0
859 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
860 ; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5]
861 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
862 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
863 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
864 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
865 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
867 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
868 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
869 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
870 ; GFX90A-TGSPLIT-NEXT: s_nop 0
871 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
872 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
874 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
875 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff
876 ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4
877 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 2
878 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0
879 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0
880 ; GFX90A-TGSPLIT-NEXT: ; implicit-def: $sgpr4
881 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
882 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
883 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v0
884 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, s6
885 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, v4
886 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, s7
887 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v5
888 ; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e64 v0, s[4:5], s4, v0
889 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
890 ; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5]
891 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
892 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
893 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
894 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
895 ; GFX90A-TGSPLIT-NEXT: s_endpgm
897 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
898 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
899 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
900 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
901 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
902 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
903 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
904 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
905 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
906 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
907 ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
908 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
909 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0
910 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0
911 ; GFX940-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr2
912 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 0
913 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
914 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
915 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
916 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
917 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 nt sc1
918 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
920 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1:
921 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
922 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
923 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
924 ; GFX940-TGSPLIT-NEXT: s_nop 0
925 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
926 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
927 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
928 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
929 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
930 ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
931 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 2
932 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0
933 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0
934 ; GFX940-TGSPLIT-NEXT: ; implicit-def: $sgpr2
935 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, 0
936 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
937 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
938 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
939 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
940 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 nt sc1
941 ; GFX940-TGSPLIT-NEXT: s_endpgm
943 ; GFX11-WGP-LABEL: flat_nontemporal_store_1:
944 ; GFX11-WGP: ; %bb.0: ; %entry
945 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
946 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
947 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
948 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
949 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
950 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
951 ; GFX11-WGP-NEXT: s_mov_b32 s0, 0x3ff
952 ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0
953 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2
954 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
955 ; GFX11-WGP-NEXT: s_mov_b32 s0, 0
956 ; GFX11-WGP-NEXT: ; implicit-def: $sgpr0
957 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
958 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
959 ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0
960 ; GFX11-WGP-NEXT: s_mov_b32 s1, s2
961 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, v3
962 ; GFX11-WGP-NEXT: s_mov_b32 s0, s3
963 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v4
964 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
965 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
966 ; GFX11-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
967 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3
968 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
969 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
970 ; GFX11-WGP-NEXT: s_endpgm
972 ; GFX11-CU-LABEL: flat_nontemporal_store_1:
973 ; GFX11-CU: ; %bb.0: ; %entry
974 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
975 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
976 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
977 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
978 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
979 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
980 ; GFX11-CU-NEXT: s_mov_b32 s0, 0x3ff
981 ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0
982 ; GFX11-CU-NEXT: s_mov_b32 s0, 2
983 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
984 ; GFX11-CU-NEXT: s_mov_b32 s0, 0
985 ; GFX11-CU-NEXT: ; implicit-def: $sgpr0
986 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
987 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
988 ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0
989 ; GFX11-CU-NEXT: s_mov_b32 s1, s2
990 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, v3
991 ; GFX11-CU-NEXT: s_mov_b32 s0, s3
992 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v4
993 ; GFX11-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
994 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
995 ; GFX11-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
996 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3
997 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
998 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
999 ; GFX11-CU-NEXT: s_endpgm
1001 ; GFX12-WGP-LABEL: flat_nontemporal_store_1:
1002 ; GFX12-WGP: ; %bb.0: ; %entry
1003 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1004 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
1005 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1006 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
1007 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
1008 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
1009 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
1010 ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
1011 ; GFX12-WGP-NEXT: s_mov_b32 s0, 2
1012 ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
1013 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0
1014 ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
1015 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
1016 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
1017 ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
1018 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2
1019 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
1020 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3
1021 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
1022 ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
1023 ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
1024 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
1025 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
1026 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1027 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
1028 ; GFX12-WGP-NEXT: s_endpgm
1030 ; GFX12-CU-LABEL: flat_nontemporal_store_1:
1031 ; GFX12-CU: ; %bb.0: ; %entry
1032 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1033 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
1034 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1035 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1
1036 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
1037 ; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
1038 ; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
1039 ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
1040 ; GFX12-CU-NEXT: s_mov_b32 s0, 2
1041 ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
1042 ; GFX12-CU-NEXT: s_mov_b32 s0, 0
1043 ; GFX12-CU-NEXT: ; implicit-def: $sgpr0
1044 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
1045 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
1046 ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
1047 ; GFX12-CU-NEXT: s_mov_b32 s1, s2
1048 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
1049 ; GFX12-CU-NEXT: s_mov_b32 s0, s3
1050 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
1051 ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
1052 ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
1053 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
1054 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
1055 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
1056 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
1057 ; GFX12-CU-NEXT: s_endpgm
1058 ptr %in, ptr %out) {
; %tid is the per-invocation index used to pick the destination element of %out
; (presumably the workitem's x id, going by the intrinsic's name).
1060 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Ordinary (non-volatile, no !nontemporal) load of the value to be stored.
1061 %val = load i32, ptr %in, align 4
; Address of element %tid of %out, computed with i32 element stride.
1062 %out.gep = getelementptr inbounds i32, ptr %out, i32 %tid
; Only this store carries !nontemporal; !0 is defined outside this part of the file.
1063 store i32 %val, ptr %out.gep, !nontemporal !0
; Test kernel @flat_nontemporal_volatile_load: the i32 load from %in is both
; volatile and tagged !nontemporal; the value is then written to %out with a
; plain store.
;
; The autogenerated assertions below pin the -O0 llc output for every RUN
; configuration listed at the top of the file. In each of them the flat load
; carries cache-policy modifiers ("glc", "glc dlc", "sc0 sc1" or
; "th:TH_LOAD_NT scope:SCOPE_SYS", depending on the target) and is followed
; directly by a wait on the load counter before anything else is issued. The
; store has no modifiers except in the gfx940 runs, where it shows "sc0 sc1".
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file).
1067 define amdgpu_kernel void @flat_nontemporal_volatile_load(
1068 ; GFX7-LABEL: flat_nontemporal_volatile_load:
1069 ; GFX7: ; %bb.0: ; %entry
1070 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
1071 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1072 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
1073 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1074 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1075 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1076 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
1077 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1078 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
1079 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
1080 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1081 ; GFX7-NEXT: flat_store_dword v[0:1], v2
1082 ; GFX7-NEXT: s_endpgm
1084 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
1085 ; GFX10-WGP: ; %bb.0: ; %entry
1086 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
1087 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1088 ; GFX10-WGP-NEXT: s_nop 0
1089 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1090 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1091 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1092 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1093 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
1094 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
1095 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
1096 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
1097 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1098 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
1099 ; GFX10-WGP-NEXT: s_endpgm
1101 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
1102 ; GFX10-CU: ; %bb.0: ; %entry
1103 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
1104 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1105 ; GFX10-CU-NEXT: s_nop 0
1106 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1107 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1108 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1109 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1110 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
1111 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
1112 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
1113 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
1114 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1115 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
1116 ; GFX10-CU-NEXT: s_endpgm
1118 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
1119 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1120 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
1121 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1122 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
1123 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1124 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1125 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1126 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
1127 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
1128 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
1129 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
1130 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1131 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
1132 ; SKIP-CACHE-INV-NEXT: s_endpgm
1134 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
1135 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1136 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
1137 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1138 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
1139 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1140 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1141 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1142 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
1143 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
1144 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1145 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1146 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
1147 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1149 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
1150 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1151 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
1152 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1153 ; GFX90A-TGSPLIT-NEXT: s_nop 0
1154 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
1155 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1156 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1157 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
1158 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1159 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
1160 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
1161 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1163 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
1164 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1165 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
1166 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1167 ; GFX940-NOTTGSPLIT-NEXT: s_nop 0
1168 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
1169 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1170 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1171 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
1172 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
1173 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1174 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1175 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1176 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1178 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
1179 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1180 ; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
1181 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
1182 ; GFX940-TGSPLIT-NEXT: s_nop 0
1183 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
1184 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1185 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1186 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
1187 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1188 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1189 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1190 ; GFX940-TGSPLIT-NEXT: s_endpgm
1192 ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
1193 ; GFX11-WGP: ; %bb.0: ; %entry
1194 ; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1195 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1196 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1197 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1198 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1199 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1200 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
1201 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
1202 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
1203 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
1204 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1205 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
1206 ; GFX11-WGP-NEXT: s_endpgm
1208 ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
1209 ; GFX11-CU: ; %bb.0: ; %entry
1210 ; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1211 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1212 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1213 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1214 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1215 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1216 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
1217 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
1218 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
1219 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
1220 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1221 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
1222 ; GFX11-CU-NEXT: s_endpgm
1224 ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
1225 ; GFX12-WGP: ; %bb.0: ; %entry
1226 ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
1227 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1228 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1229 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1230 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1231 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1232 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
1233 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1234 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1235 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
1236 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
1237 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
1238 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
1239 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
1240 ; GFX12-WGP-NEXT: s_endpgm
1242 ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
1243 ; GFX12-CU: ; %bb.0: ; %entry
1244 ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
1245 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
1246 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
1247 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1248 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1249 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1250 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
1251 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
1252 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0
1253 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
1254 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
1255 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
1256 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1257 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
1258 ; GFX12-CU-NEXT: s_endpgm
1259 ptr %in, ptr %out) {
; The load combines `volatile` with !nontemporal; !0 is defined outside this part of the file.
1261 %val = load volatile i32, ptr %in, align 4, !nontemporal !0
; Plain store: neither volatile nor !nontemporal.
1262 store i32 %val, ptr %out
; Declaration of the intrinsic that @flat_nontemporal_store_1 calls to obtain %tid.
; NOTE(review): presumably returns the workitem's x index within its workgroup,
; going by the name; confirm against the AMDGPU backend documentation.
1267 declare i32 @llvm.amdgcn.workitem.id.x()