1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
; Nontemporal load through a flat pointer taken directly from the kernel
; arguments. The i32 read from %in carries !nontemporal metadata and is copied
; to %out with an ordinary store, so in every check block below only the flat
; load carries cache-policy modifiers (glc slc, slc, nt, or slc dlc depending
; on the target), while the flat store stays unmodified.
; Each check block pins the complete instruction sequence, including the final
; s_endpgm, so that any extra trailing instruction is detected.
13 define amdgpu_kernel void @flat_nontemporal_load_0(
14 ; GFX7-LABEL: flat_nontemporal_load_0:
15 ; GFX7: ; %bb.0: ; %entry
16 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
17 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
19 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
20 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc
21 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
22 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
23 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24 ; GFX7-NEXT: flat_store_dword v[0:1], v2
25 ; GFX7-NEXT: s_endpgm
27 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
28 ; GFX10-WGP: ; %bb.0: ; %entry
29 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
30 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
32 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
33 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
34 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
35 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
36 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
37 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
38 ; GFX10-WGP-NEXT: s_endpgm
40 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
41 ; GFX10-CU: ; %bb.0: ; %entry
42 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
43 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
44 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
45 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
46 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
47 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
48 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
49 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
50 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
51 ; GFX10-CU-NEXT: s_endpgm
53 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
54 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
55 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
56 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
57 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
58 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
59 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc
60 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
61 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
62 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
63 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
64 ; SKIP-CACHE-INV-NEXT: s_endpgm
66 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
67 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
68 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
69 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
71 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
72 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
73 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
74 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
75 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
76 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
77 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
79 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
80 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
81 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
82 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
83 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
84 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
85 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
86 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
87 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
88 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
89 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
90 ; GFX90A-TGSPLIT-NEXT: s_endpgm
92 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
93 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
94 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
95 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
97 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
98 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
99 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
100 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
101 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
102 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
103 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
105 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0:
106 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
107 ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
108 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
110 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
111 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
112 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
113 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
114 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
115 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
116 ; GFX940-TGSPLIT-NEXT: s_endpgm
118 ; GFX11-WGP-LABEL: flat_nontemporal_load_0:
119 ; GFX11-WGP: ; %bb.0: ; %entry
120 ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
121 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
123 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
124 ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
125 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
126 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
127 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
128 ; GFX11-WGP-NEXT: s_endpgm
130 ; GFX11-CU-LABEL: flat_nontemporal_load_0:
131 ; GFX11-CU: ; %bb.0: ; %entry
132 ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
133 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
135 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
136 ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
137 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
138 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
139 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
140 ; GFX11-CU-NEXT: s_endpgm
141 i32* %in, i32* %out) {
143 %val = load i32, i32* %in, align 4, !nontemporal !0
144 store i32 %val, i32* %out
; Nontemporal load through a per-workitem flat address. %in is indexed with
; the workitem id in x (getelementptr on i32, hence the shift-left-by-2 and
; 64-bit add in the checks), the element is loaded with !nontemporal metadata,
; and the value is written to %out with an ordinary store. As in the uniform
; variant, only the flat load carries cache-policy modifiers in the checks.
148 define amdgpu_kernel void @flat_nontemporal_load_1(
149 ; GFX7-LABEL: flat_nontemporal_load_1:
150 ; GFX7: ; %bb.0: ; %entry
151 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
152 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
153 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
155 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
156 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
157 ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc slc
158 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
159 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
160 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
161 ; GFX7-NEXT: flat_store_dword v[0:1], v2
162 ; GFX7-NEXT: s_endpgm
164 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
165 ; GFX10-WGP: ; %bb.0: ; %entry
166 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
167 ; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
168 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
169 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
170 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
171 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
172 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
173 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
174 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
175 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
176 ; GFX10-WGP-NEXT: s_endpgm
178 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
179 ; GFX10-CU: ; %bb.0: ; %entry
180 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
181 ; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
182 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
184 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
185 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
186 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
187 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
188 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
189 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
190 ; GFX10-CU-NEXT: s_endpgm
192 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
193 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
194 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
195 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
196 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
197 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
198 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
199 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
200 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc slc
201 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
202 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
203 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
204 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
205 ; SKIP-CACHE-INV-NEXT: s_endpgm
207 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
208 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
209 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
210 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
211 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
213 ; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
214 ; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
215 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
216 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
217 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
218 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
219 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
220 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
222 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
223 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
224 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
225 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
226 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
228 ; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
229 ; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
230 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc
231 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
232 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
233 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
234 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
235 ; GFX90A-TGSPLIT-NEXT: s_endpgm
237 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
238 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
239 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
240 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
241 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
242 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
244 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
245 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
246 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
247 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
248 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
249 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
251 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1:
252 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
253 ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
254 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
255 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
256 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
257 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
258 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
259 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
260 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
261 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
262 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
263 ; GFX940-TGSPLIT-NEXT: s_endpgm
265 ; GFX11-WGP-LABEL: flat_nontemporal_load_1:
266 ; GFX11-WGP: ; %bb.0: ; %entry
267 ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
268 ; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
269 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
271 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
272 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
273 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
274 ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
275 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
276 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
277 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
278 ; GFX11-WGP-NEXT: s_endpgm
280 ; GFX11-CU-LABEL: flat_nontemporal_load_1:
281 ; GFX11-CU: ; %bb.0: ; %entry
282 ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
283 ; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
286 ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
287 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
288 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
289 ; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
290 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
291 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
292 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
293 ; GFX11-CU-NEXT: s_endpgm
294 i32* %in, i32* %out) {
296 %tid = call i32 @llvm.amdgcn.workitem.id.x()
297 %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
298 %val = load i32, i32* %val.gep, align 4, !nontemporal !0
299 store i32 %val, i32* %out
; Nontemporal store through a flat pointer taken directly from the kernel
; arguments. The i32 is read from %in with an ordinary load and written to
; %out with !nontemporal metadata, so in the checks below the flat load is
; unmodified and only the flat store carries cache-policy modifiers
; (glc slc, nt, or glc slc dlc depending on the target).
303 define amdgpu_kernel void @flat_nontemporal_store_0(
304 ; GFX7-LABEL: flat_nontemporal_store_0:
305 ; GFX7: ; %bb.0: ; %entry
306 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
307 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
309 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
310 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
311 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
312 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
313 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
314 ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
315 ; GFX7-NEXT: s_endpgm
317 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
318 ; GFX10-WGP: ; %bb.0: ; %entry
319 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
320 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
321 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
322 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
323 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
324 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
325 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
326 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
327 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
328 ; GFX10-WGP-NEXT: s_endpgm
330 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
331 ; GFX10-CU: ; %bb.0: ; %entry
332 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
333 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
334 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
335 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
336 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
337 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
338 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
339 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
340 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
341 ; GFX10-CU-NEXT: s_endpgm
343 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
344 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
345 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
346 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
347 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
348 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
349 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
350 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
351 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
352 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
353 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
354 ; SKIP-CACHE-INV-NEXT: s_endpgm
356 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
357 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
358 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
359 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
361 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
362 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
363 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
364 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
365 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
366 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
367 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
369 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
370 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
371 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
372 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
373 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
374 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
375 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
376 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
377 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
378 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
379 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
380 ; GFX90A-TGSPLIT-NEXT: s_endpgm
382 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
383 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
384 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
385 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
387 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
388 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
389 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
390 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
391 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
392 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
393 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
395 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0:
396 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
397 ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
398 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
400 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
401 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
402 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
403 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
404 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
405 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
406 ; GFX940-TGSPLIT-NEXT: s_endpgm
408 ; GFX11-WGP-LABEL: flat_nontemporal_store_0:
409 ; GFX11-WGP: ; %bb.0: ; %entry
410 ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
411 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
412 ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
413 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
414 ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
415 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
416 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
417 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
418 ; GFX11-WGP-NEXT: s_endpgm
420 ; GFX11-CU-LABEL: flat_nontemporal_store_0:
421 ; GFX11-CU: ; %bb.0: ; %entry
422 ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
423 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
425 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
426 ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
427 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
428 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
429 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
430 ; GFX11-CU-NEXT: s_endpgm
431 i32* %in, i32* %out) {
433 %val = load i32, i32* %in, align 4
434 store i32 %val, i32* %out, !nontemporal !0
; Nontemporal store through a per-workitem flat address. The i32 is read from
; %in with an ordinary load; %out is indexed with the workitem id in x
; (getelementptr on i32, hence the shift-left-by-2 and 64-bit add in the
; checks) and the element is written with !nontemporal metadata. Only the
; flat store carries cache-policy modifiers in the checks below.
438 define amdgpu_kernel void @flat_nontemporal_store_1(
439 ; GFX7-LABEL: flat_nontemporal_store_1:
440 ; GFX7: ; %bb.0: ; %entry
441 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
442 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
443 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
445 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
446 ; GFX7-NEXT: flat_load_dword v2, v[1:2]
447 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
448 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
449 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
450 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
451 ; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc
452 ; GFX7-NEXT: s_endpgm
454 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
455 ; GFX10-WGP: ; %bb.0: ; %entry
456 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
457 ; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
458 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
460 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
461 ; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
462 ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
463 ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
464 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
465 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
466 ; GFX10-WGP-NEXT: s_endpgm
468 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
469 ; GFX10-CU: ; %bb.0: ; %entry
470 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
471 ; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
472 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
474 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
475 ; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
476 ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
477 ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
478 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
479 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
480 ; GFX10-CU-NEXT: s_endpgm
482 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
483 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
484 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
485 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
486 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
487 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
488 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
489 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
490 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
491 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
492 ; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
493 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
494 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc
495 ; SKIP-CACHE-INV-NEXT: s_endpgm
497 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
498 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
499 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
500 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
501 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
503 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1
504 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
505 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
506 ; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
507 ; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
508 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
509 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
510 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
512 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
513 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
514 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
515 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
516 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
517 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
518 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1
519 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
520 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
521 ; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
522 ; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
523 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
524 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
525 ; GFX90A-TGSPLIT-NEXT: s_endpgm
527 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
528 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
529 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
530 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
531 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
532 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
533 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
534 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1
535 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
536 ; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
537 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
538 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
539 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
541 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1:
542 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
543 ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
544 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
545 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
546 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
547 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
548 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1
549 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
550 ; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
551 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
552 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
553 ; GFX940-TGSPLIT-NEXT: s_endpgm
555 ; GFX11-WGP-LABEL: flat_nontemporal_store_1:
556 ; GFX11-WGP: ; %bb.0: ; %entry
557 ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
558 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
559 ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
560 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
561 ; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2)
562 ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
563 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
564 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
565 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
566 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
567 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
568 ; GFX11-WGP-NEXT: s_endpgm
570 ; GFX11-CU-LABEL: flat_nontemporal_store_1:
571 ; GFX11-CU: ; %bb.0: ; %entry
572 ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
573 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
574 ; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
575 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
576 ; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2)
577 ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
578 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
579 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
580 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
581 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
582 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
583 ; GFX11-CU-NEXT: s_endpgm
584 i32* %in, i32* %out) {
586 %tid = call i32 @llvm.amdgcn.workitem.id.x()
587 %val = load i32, i32* %in, align 4
588 %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
589 store i32 %val, i32* %out.gep, !nontemporal !0
; Intrinsic called by the *_load_1 and *_store_1 kernels to obtain the i32
; index they feed into getelementptr; by its name it yields the workitem id
; in the x dimension.
594 declare i32 @llvm.amdgcn.workitem.id.x()