1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
5 ; Due to high register pressure, regalloc splits the live range of the WWM VGPR used for SGPR spills
6 ; and introduces a copy. That copy must be a whole-wave copy, with exec mask manipulation around it.
7 ; FIXME: The destination register of the whole-wave copy should be treated as needing all of its lanes preserved,
8 ; with a spill/restore in the function prolog/epilog. Otherwise the copy may clobber the register's inactive lanes.
9 define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
10 ; GFX906-LABEL: preserve_wwm_copy_dstreg:
12 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; GFX906-NEXT: s_mov_b32 s16, s33
14 ; GFX906-NEXT: s_mov_b32 s33, s32
15 ; GFX906-NEXT: s_xor_saveexec_b64 s[18:19], -1
16 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
17 ; GFX906-NEXT: s_mov_b64 exec, -1
18 ; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
19 ; GFX906-NEXT: s_mov_b64 exec, s[18:19]
20 ; GFX906-NEXT: s_mov_b32 s21, s15
21 ; GFX906-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
22 ; GFX906-NEXT: s_mov_b32 s22, s14
23 ; GFX906-NEXT: v_writelane_b32 v39, s21, 0
24 ; GFX906-NEXT: v_writelane_b32 v39, s22, 1
25 ; GFX906-NEXT: s_mov_b32 s23, s13
26 ; GFX906-NEXT: v_writelane_b32 v39, s23, 2
27 ; GFX906-NEXT: s_mov_b32 s24, s12
28 ; GFX906-NEXT: v_writelane_b32 v39, s24, 3
29 ; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
30 ; GFX906-NEXT: v_writelane_b32 v39, s26, 4
31 ; GFX906-NEXT: v_writelane_b32 v39, s27, 5
32 ; GFX906-NEXT: v_writelane_b32 v39, s8, 6
33 ; GFX906-NEXT: v_writelane_b32 v41, s16, 4
34 ; GFX906-NEXT: v_writelane_b32 v39, s9, 7
35 ; GFX906-NEXT: v_writelane_b32 v41, s34, 2
36 ; GFX906-NEXT: v_writelane_b32 v39, s6, 8
37 ; GFX906-NEXT: v_writelane_b32 v41, s35, 3
38 ; GFX906-NEXT: v_writelane_b32 v39, s7, 9
39 ; GFX906-NEXT: v_writelane_b32 v41, s30, 0
40 ; GFX906-NEXT: v_writelane_b32 v39, s4, 10
41 ; GFX906-NEXT: s_addk_i32 s32, 0x2800
42 ; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
43 ; GFX906-NEXT: v_writelane_b32 v41, s31, 1
44 ; GFX906-NEXT: v_mov_b32_e32 v32, v31
45 ; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
46 ; GFX906-NEXT: s_nop 0
47 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
48 ; GFX906-NEXT: v_writelane_b32 v39, s5, 11
49 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
50 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
51 ; GFX906-NEXT: ;;#ASMSTART
52 ; GFX906-NEXT: ; def v[0:31]
53 ; GFX906-NEXT: ;;#ASMEND
54 ; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
55 ; GFX906-NEXT: s_nop 0
56 ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
57 ; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
58 ; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
59 ; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
60 ; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
61 ; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
62 ; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
63 ; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
64 ; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
65 ; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
66 ; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
67 ; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
68 ; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
69 ; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
70 ; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
71 ; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
72 ; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
73 ; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
74 ; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
75 ; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
76 ; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
77 ; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
78 ; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
79 ; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
80 ; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
81 ; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
82 ; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
83 ; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
84 ; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
85 ; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
86 ; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
87 ; GFX906-NEXT: ;;#ASMSTART
88 ; GFX906-NEXT: ; def v40
89 ; GFX906-NEXT: ;;#ASMEND
90 ; GFX906-NEXT: ;;#ASMSTART
91 ; GFX906-NEXT: ; def s11
92 ; GFX906-NEXT: ;;#ASMEND
93 ; GFX906-NEXT: v_writelane_b32 v39, s11, 12
94 ; GFX906-NEXT: ;;#ASMSTART
95 ; GFX906-NEXT: ; def s12
96 ; GFX906-NEXT: ;;#ASMEND
97 ; GFX906-NEXT: v_writelane_b32 v39, s12, 13
98 ; GFX906-NEXT: ;;#ASMSTART
99 ; GFX906-NEXT: ; def s13
100 ; GFX906-NEXT: ;;#ASMEND
101 ; GFX906-NEXT: v_writelane_b32 v39, s13, 14
102 ; GFX906-NEXT: ;;#ASMSTART
103 ; GFX906-NEXT: ; def s14
104 ; GFX906-NEXT: ;;#ASMEND
105 ; GFX906-NEXT: v_writelane_b32 v39, s14, 15
106 ; GFX906-NEXT: ;;#ASMSTART
107 ; GFX906-NEXT: ; def s15
108 ; GFX906-NEXT: ;;#ASMEND
109 ; GFX906-NEXT: v_writelane_b32 v39, s15, 16
110 ; GFX906-NEXT: s_getpc_b64 s[10:11]
111 ; GFX906-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4
112 ; GFX906-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12
113 ; GFX906-NEXT: ;;#ASMSTART
114 ; GFX906-NEXT: ; def s16
115 ; GFX906-NEXT: ;;#ASMEND
116 ; GFX906-NEXT: v_writelane_b32 v39, s16, 17
117 ; GFX906-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0
118 ; GFX906-NEXT: ;;#ASMSTART
119 ; GFX906-NEXT: ; def s17
120 ; GFX906-NEXT: ;;#ASMEND
121 ; GFX906-NEXT: v_writelane_b32 v39, s17, 18
122 ; GFX906-NEXT: ;;#ASMSTART
123 ; GFX906-NEXT: ; def s18
124 ; GFX906-NEXT: ;;#ASMEND
125 ; GFX906-NEXT: v_writelane_b32 v39, s18, 19
126 ; GFX906-NEXT: ;;#ASMSTART
127 ; GFX906-NEXT: ; def s19
128 ; GFX906-NEXT: ;;#ASMEND
129 ; GFX906-NEXT: v_writelane_b32 v39, s19, 20
130 ; GFX906-NEXT: ;;#ASMSTART
131 ; GFX906-NEXT: ; def s20
132 ; GFX906-NEXT: ;;#ASMEND
133 ; GFX906-NEXT: v_writelane_b32 v39, s20, 21
134 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
135 ; GFX906-NEXT: v_writelane_b32 v39, s10, 22
136 ; GFX906-NEXT: v_writelane_b32 v39, s11, 23
137 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
138 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
139 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
140 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
141 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
142 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22
143 ; GFX906-NEXT: s_mov_b32 s12, s24
144 ; GFX906-NEXT: s_mov_b32 s13, s23
145 ; GFX906-NEXT: s_mov_b32 s14, s22
146 ; GFX906-NEXT: v_mov_b32_e32 v31, v32
147 ; GFX906-NEXT: s_mov_b32 s15, s21
148 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27]
149 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23
150 ; GFX906-NEXT: v_mov_b32_e32 v40, v32
151 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
152 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
153 ; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
154 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
155 ; GFX906-NEXT: s_waitcnt vmcnt(0)
156 ; GFX906-NEXT: v_readlane_b32 s11, v39, 12
157 ; GFX906-NEXT: ;;#ASMSTART
158 ; GFX906-NEXT: ; use s11
159 ; GFX906-NEXT: ;;#ASMEND
160 ; GFX906-NEXT: v_readlane_b32 s12, v39, 13
161 ; GFX906-NEXT: ;;#ASMSTART
162 ; GFX906-NEXT: ; use s12
163 ; GFX906-NEXT: ;;#ASMEND
164 ; GFX906-NEXT: v_readlane_b32 s13, v39, 14
165 ; GFX906-NEXT: ;;#ASMSTART
166 ; GFX906-NEXT: ; use s13
167 ; GFX906-NEXT: ;;#ASMEND
168 ; GFX906-NEXT: v_readlane_b32 s14, v39, 15
169 ; GFX906-NEXT: ;;#ASMSTART
170 ; GFX906-NEXT: ; use s14
171 ; GFX906-NEXT: ;;#ASMEND
172 ; GFX906-NEXT: v_readlane_b32 s15, v39, 16
173 ; GFX906-NEXT: ;;#ASMSTART
174 ; GFX906-NEXT: ; use s15
175 ; GFX906-NEXT: ;;#ASMEND
176 ; GFX906-NEXT: v_readlane_b32 s16, v39, 17
177 ; GFX906-NEXT: ;;#ASMSTART
178 ; GFX906-NEXT: ; use s16
179 ; GFX906-NEXT: ;;#ASMEND
180 ; GFX906-NEXT: v_readlane_b32 s17, v39, 18
181 ; GFX906-NEXT: ;;#ASMSTART
182 ; GFX906-NEXT: ; use s17
183 ; GFX906-NEXT: ;;#ASMEND
184 ; GFX906-NEXT: v_readlane_b32 s18, v39, 19
185 ; GFX906-NEXT: ;;#ASMSTART
186 ; GFX906-NEXT: ; use s18
187 ; GFX906-NEXT: ;;#ASMEND
188 ; GFX906-NEXT: v_readlane_b32 s19, v39, 20
189 ; GFX906-NEXT: ;;#ASMSTART
190 ; GFX906-NEXT: ; use s19
191 ; GFX906-NEXT: ;;#ASMEND
192 ; GFX906-NEXT: v_readlane_b32 s20, v39, 21
193 ; GFX906-NEXT: ;;#ASMSTART
194 ; GFX906-NEXT: ; use s20
195 ; GFX906-NEXT: ;;#ASMEND
196 ; GFX906-NEXT: ;;#ASMSTART
197 ; GFX906-NEXT: ; def s21
198 ; GFX906-NEXT: ;;#ASMEND
199 ; GFX906-NEXT: v_writelane_b32 v39, s21, 12
200 ; GFX906-NEXT: ;;#ASMSTART
201 ; GFX906-NEXT: ; def s22
202 ; GFX906-NEXT: ;;#ASMEND
203 ; GFX906-NEXT: v_writelane_b32 v39, s22, 13
204 ; GFX906-NEXT: ;;#ASMSTART
205 ; GFX906-NEXT: ; def s23
206 ; GFX906-NEXT: ;;#ASMEND
207 ; GFX906-NEXT: v_writelane_b32 v39, s23, 14
208 ; GFX906-NEXT: ;;#ASMSTART
209 ; GFX906-NEXT: ; def s24
210 ; GFX906-NEXT: ;;#ASMEND
211 ; GFX906-NEXT: v_writelane_b32 v39, s24, 15
212 ; GFX906-NEXT: ;;#ASMSTART
213 ; GFX906-NEXT: ; def s25
214 ; GFX906-NEXT: ;;#ASMEND
215 ; GFX906-NEXT: v_writelane_b32 v39, s25, 16
216 ; GFX906-NEXT: ;;#ASMSTART
217 ; GFX906-NEXT: ; def s26
218 ; GFX906-NEXT: ;;#ASMEND
219 ; GFX906-NEXT: v_writelane_b32 v39, s26, 17
220 ; GFX906-NEXT: ;;#ASMSTART
221 ; GFX906-NEXT: ; def s27
222 ; GFX906-NEXT: ;;#ASMEND
223 ; GFX906-NEXT: v_writelane_b32 v39, s27, 18
224 ; GFX906-NEXT: ;;#ASMSTART
225 ; GFX906-NEXT: ; def s28
226 ; GFX906-NEXT: ;;#ASMEND
227 ; GFX906-NEXT: v_writelane_b32 v39, s28, 19
228 ; GFX906-NEXT: ;;#ASMSTART
229 ; GFX906-NEXT: ; def s29
230 ; GFX906-NEXT: ;;#ASMEND
231 ; GFX906-NEXT: v_writelane_b32 v39, s29, 20
232 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
233 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
234 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
235 ; GFX906-NEXT: v_readlane_b32 s4, v39, 10
236 ; GFX906-NEXT: v_readlane_b32 s6, v39, 8
237 ; GFX906-NEXT: v_readlane_b32 s8, v39, 6
238 ; GFX906-NEXT: v_readlane_b32 s10, v39, 4
239 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22
240 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3
241 ; GFX906-NEXT: v_mov_b32_e32 v31, v40
242 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2
243 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1
244 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0
245 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11
246 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9
247 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7
248 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5
249 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23
250 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
251 ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
252 ; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
253 ; GFX906-NEXT: s_mov_b64 exec, s[34:35]
254 ; GFX906-NEXT: s_waitcnt vmcnt(0)
255 ; GFX906-NEXT: v_readlane_b32 s4, v39, 10
256 ; GFX906-NEXT: v_readlane_b32 s6, v39, 8
257 ; GFX906-NEXT: v_readlane_b32 s8, v39, 6
258 ; GFX906-NEXT: v_readlane_b32 s10, v39, 4
259 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22
260 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11
261 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9
262 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7
263 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5
264 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3
265 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2
266 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1
267 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0
268 ; GFX906-NEXT: v_mov_b32_e32 v31, v40
269 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23
270 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12
271 ; GFX906-NEXT: ;;#ASMSTART
272 ; GFX906-NEXT: ; use s21
273 ; GFX906-NEXT: ;;#ASMEND
274 ; GFX906-NEXT: v_readlane_b32 s22, v39, 13
275 ; GFX906-NEXT: ;;#ASMSTART
276 ; GFX906-NEXT: ; use s22
277 ; GFX906-NEXT: ;;#ASMEND
278 ; GFX906-NEXT: v_readlane_b32 s23, v39, 14
279 ; GFX906-NEXT: ;;#ASMSTART
280 ; GFX906-NEXT: ; use s23
281 ; GFX906-NEXT: ;;#ASMEND
282 ; GFX906-NEXT: v_readlane_b32 s24, v39, 15
283 ; GFX906-NEXT: ;;#ASMSTART
284 ; GFX906-NEXT: ; use s24
285 ; GFX906-NEXT: ;;#ASMEND
286 ; GFX906-NEXT: v_readlane_b32 s25, v39, 16
287 ; GFX906-NEXT: ;;#ASMSTART
288 ; GFX906-NEXT: ; use s25
289 ; GFX906-NEXT: ;;#ASMEND
290 ; GFX906-NEXT: v_readlane_b32 s26, v39, 17
291 ; GFX906-NEXT: ;;#ASMSTART
292 ; GFX906-NEXT: ; use s26
293 ; GFX906-NEXT: ;;#ASMEND
294 ; GFX906-NEXT: v_readlane_b32 s27, v39, 18
295 ; GFX906-NEXT: ;;#ASMSTART
296 ; GFX906-NEXT: ; use s27
297 ; GFX906-NEXT: ;;#ASMEND
298 ; GFX906-NEXT: v_readlane_b32 s28, v39, 19
299 ; GFX906-NEXT: ;;#ASMSTART
300 ; GFX906-NEXT: ; use s28
301 ; GFX906-NEXT: ;;#ASMEND
302 ; GFX906-NEXT: v_readlane_b32 s29, v39, 20
303 ; GFX906-NEXT: ;;#ASMSTART
304 ; GFX906-NEXT: ; use s29
305 ; GFX906-NEXT: ;;#ASMEND
306 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
307 ; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
308 ; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
309 ; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
310 ; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
311 ; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
312 ; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
313 ; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
314 ; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
315 ; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
316 ; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
317 ; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
318 ; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
319 ; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
320 ; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
321 ; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
322 ; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
323 ; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
324 ; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
325 ; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
326 ; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
327 ; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
328 ; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
329 ; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
330 ; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
331 ; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
332 ; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
333 ; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
334 ; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
335 ; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
336 ; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
337 ; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
338 ; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
339 ; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
340 ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
341 ; GFX906-NEXT: v_readlane_b32 s31, v41, 1
342 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0
343 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4
344 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2
345 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3
346 ; GFX906-NEXT: s_waitcnt vmcnt(0)
347 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
348 ; GFX906-NEXT: s_waitcnt vmcnt(0)
349 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96
350 ; GFX906-NEXT: s_waitcnt vmcnt(0)
351 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80
352 ; GFX906-NEXT: s_waitcnt vmcnt(0)
353 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64
354 ; GFX906-NEXT: s_waitcnt vmcnt(0)
355 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48
356 ; GFX906-NEXT: s_waitcnt vmcnt(0)
357 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32
358 ; GFX906-NEXT: s_waitcnt vmcnt(0)
359 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16
360 ; GFX906-NEXT: s_waitcnt vmcnt(0)
361 ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
362 ; GFX906-NEXT: s_waitcnt vmcnt(0)
363 ; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
364 ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1
365 ; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
366 ; GFX906-NEXT: s_mov_b64 exec, -1
367 ; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
368 ; GFX906-NEXT: s_mov_b64 exec, s[6:7]
369 ; GFX906-NEXT: s_addk_i32 s32, 0xd800
370 ; GFX906-NEXT: s_mov_b32 s33, s4
371 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
372 ; GFX906-NEXT: s_setpc_b64 s[30:31]
374 ; GFX908-LABEL: preserve_wwm_copy_dstreg:
376 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GFX908-NEXT: s_mov_b32 s16, s33
378 ; GFX908-NEXT: s_mov_b32 s33, s32
379 ; GFX908-NEXT: s_xor_saveexec_b64 s[18:19], -1
380 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
381 ; GFX908-NEXT: s_mov_b64 exec, s[18:19]
382 ; GFX908-NEXT: v_mov_b32_e32 v2, s16
383 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
384 ; GFX908-NEXT: v_mov_b32_e32 v2, s34
385 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
386 ; GFX908-NEXT: v_mov_b32_e32 v2, s35
387 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
388 ; GFX908-NEXT: s_addk_i32 s32, 0x2c00
389 ; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
390 ; GFX908-NEXT: s_mov_b64 s[16:17], exec
391 ; GFX908-NEXT: s_mov_b64 exec, 1
392 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168
393 ; GFX908-NEXT: v_writelane_b32 v2, s30, 0
394 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
395 ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168
396 ; GFX908-NEXT: s_waitcnt vmcnt(0)
397 ; GFX908-NEXT: s_mov_b64 exec, s[16:17]
398 ; GFX908-NEXT: s_mov_b64 s[16:17], exec
399 ; GFX908-NEXT: s_mov_b64 exec, 1
400 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:168
401 ; GFX908-NEXT: v_writelane_b32 v2, s31, 0
402 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
403 ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168
404 ; GFX908-NEXT: s_waitcnt vmcnt(0)
405 ; GFX908-NEXT: s_mov_b64 exec, s[16:17]
406 ; GFX908-NEXT: s_mov_b32 s21, s15
407 ; GFX908-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane
408 ; GFX908-NEXT: s_mov_b32 s22, s14
409 ; GFX908-NEXT: v_writelane_b32 v39, s21, 0
410 ; GFX908-NEXT: v_writelane_b32 v39, s22, 1
411 ; GFX908-NEXT: s_mov_b32 s23, s13
412 ; GFX908-NEXT: v_writelane_b32 v39, s23, 2
413 ; GFX908-NEXT: s_mov_b32 s24, s12
414 ; GFX908-NEXT: v_writelane_b32 v39, s24, 3
415 ; GFX908-NEXT: s_mov_b64 s[26:27], s[10:11]
416 ; GFX908-NEXT: v_writelane_b32 v39, s26, 4
417 ; GFX908-NEXT: v_writelane_b32 v39, s27, 5
418 ; GFX908-NEXT: v_writelane_b32 v39, s8, 6
419 ; GFX908-NEXT: v_writelane_b32 v39, s9, 7
420 ; GFX908-NEXT: v_writelane_b32 v39, s6, 8
421 ; GFX908-NEXT: v_writelane_b32 v39, s7, 9
422 ; GFX908-NEXT: v_writelane_b32 v39, s4, 10
423 ; GFX908-NEXT: v_mov_b32_e32 v32, v31
424 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
425 ; GFX908-NEXT: s_nop 0
426 ; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
427 ; GFX908-NEXT: v_writelane_b32 v39, s5, 11
428 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
429 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
430 ; GFX908-NEXT: ;;#ASMSTART
431 ; GFX908-NEXT: ; def v[0:31]
432 ; GFX908-NEXT: ;;#ASMEND
433 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
434 ; GFX908-NEXT: s_nop 0
435 ; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
436 ; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
437 ; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
438 ; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
439 ; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
440 ; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
441 ; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
442 ; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
443 ; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
444 ; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
445 ; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
446 ; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
447 ; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
448 ; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
449 ; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
450 ; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
451 ; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
452 ; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
453 ; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
454 ; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
455 ; GFX908-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
456 ; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
457 ; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
458 ; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
459 ; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
460 ; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
461 ; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
462 ; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
463 ; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
464 ; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
465 ; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
466 ; GFX908-NEXT: ;;#ASMSTART
467 ; GFX908-NEXT: ; def v40
468 ; GFX908-NEXT: ;;#ASMEND
469 ; GFX908-NEXT: ;;#ASMSTART
470 ; GFX908-NEXT: ; def s11
471 ; GFX908-NEXT: ;;#ASMEND
472 ; GFX908-NEXT: v_writelane_b32 v39, s11, 12
473 ; GFX908-NEXT: ;;#ASMSTART
474 ; GFX908-NEXT: ; def s12
475 ; GFX908-NEXT: ;;#ASMEND
476 ; GFX908-NEXT: v_writelane_b32 v39, s12, 13
477 ; GFX908-NEXT: ;;#ASMSTART
478 ; GFX908-NEXT: ; def s13
479 ; GFX908-NEXT: ;;#ASMEND
480 ; GFX908-NEXT: v_writelane_b32 v39, s13, 14
481 ; GFX908-NEXT: ;;#ASMSTART
482 ; GFX908-NEXT: ; def s14
483 ; GFX908-NEXT: ;;#ASMEND
484 ; GFX908-NEXT: v_writelane_b32 v39, s14, 15
485 ; GFX908-NEXT: ;;#ASMSTART
486 ; GFX908-NEXT: ; def s15
487 ; GFX908-NEXT: ;;#ASMEND
488 ; GFX908-NEXT: v_writelane_b32 v39, s15, 16
489 ; GFX908-NEXT: s_getpc_b64 s[10:11]
490 ; GFX908-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4
491 ; GFX908-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12
492 ; GFX908-NEXT: ;;#ASMSTART
493 ; GFX908-NEXT: ; def s16
494 ; GFX908-NEXT: ;;#ASMEND
495 ; GFX908-NEXT: v_writelane_b32 v39, s16, 17
496 ; GFX908-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0
497 ; GFX908-NEXT: ;;#ASMSTART
498 ; GFX908-NEXT: ; def s17
499 ; GFX908-NEXT: ;;#ASMEND
500 ; GFX908-NEXT: v_writelane_b32 v39, s17, 18
501 ; GFX908-NEXT: ;;#ASMSTART
502 ; GFX908-NEXT: ; def s18
503 ; GFX908-NEXT: ;;#ASMEND
504 ; GFX908-NEXT: v_writelane_b32 v39, s18, 19
505 ; GFX908-NEXT: ;;#ASMSTART
506 ; GFX908-NEXT: ; def s19
507 ; GFX908-NEXT: ;;#ASMEND
508 ; GFX908-NEXT: v_writelane_b32 v39, s19, 20
509 ; GFX908-NEXT: ;;#ASMSTART
510 ; GFX908-NEXT: ; def s20
511 ; GFX908-NEXT: ;;#ASMEND
512 ; GFX908-NEXT: v_writelane_b32 v39, s20, 21
513 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX908-NEXT: v_writelane_b32 v39, s10, 22
515 ; GFX908-NEXT: v_writelane_b32 v39, s11, 23
516 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
517 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
518 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
519 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
520 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
521 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22
522 ; GFX908-NEXT: s_mov_b32 s12, s24
523 ; GFX908-NEXT: s_mov_b32 s13, s23
524 ; GFX908-NEXT: s_mov_b32 s14, s22
525 ; GFX908-NEXT: v_mov_b32_e32 v31, v32
526 ; GFX908-NEXT: s_mov_b32 s15, s21
527 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27]
528 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23
529 ; GFX908-NEXT: v_mov_b32_e32 v40, v32
530 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
531 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
532 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
533 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
534 ; GFX908-NEXT: s_waitcnt vmcnt(0)
535 ; GFX908-NEXT: v_readlane_b32 s11, v39, 12
536 ; GFX908-NEXT: ;;#ASMSTART
537 ; GFX908-NEXT: ; use s11
538 ; GFX908-NEXT: ;;#ASMEND
539 ; GFX908-NEXT: v_readlane_b32 s12, v39, 13
540 ; GFX908-NEXT: ;;#ASMSTART
541 ; GFX908-NEXT: ; use s12
542 ; GFX908-NEXT: ;;#ASMEND
543 ; GFX908-NEXT: v_readlane_b32 s13, v39, 14
544 ; GFX908-NEXT: ;;#ASMSTART
545 ; GFX908-NEXT: ; use s13
546 ; GFX908-NEXT: ;;#ASMEND
547 ; GFX908-NEXT: v_readlane_b32 s14, v39, 15
548 ; GFX908-NEXT: ;;#ASMSTART
549 ; GFX908-NEXT: ; use s14
550 ; GFX908-NEXT: ;;#ASMEND
551 ; GFX908-NEXT: v_readlane_b32 s15, v39, 16
552 ; GFX908-NEXT: ;;#ASMSTART
553 ; GFX908-NEXT: ; use s15
554 ; GFX908-NEXT: ;;#ASMEND
555 ; GFX908-NEXT: v_readlane_b32 s16, v39, 17
556 ; GFX908-NEXT: ;;#ASMSTART
557 ; GFX908-NEXT: ; use s16
558 ; GFX908-NEXT: ;;#ASMEND
559 ; GFX908-NEXT: v_readlane_b32 s17, v39, 18
560 ; GFX908-NEXT: ;;#ASMSTART
561 ; GFX908-NEXT: ; use s17
562 ; GFX908-NEXT: ;;#ASMEND
563 ; GFX908-NEXT: v_readlane_b32 s18, v39, 19
564 ; GFX908-NEXT: ;;#ASMSTART
565 ; GFX908-NEXT: ; use s18
566 ; GFX908-NEXT: ;;#ASMEND
567 ; GFX908-NEXT: v_readlane_b32 s19, v39, 20
568 ; GFX908-NEXT: ;;#ASMSTART
569 ; GFX908-NEXT: ; use s19
570 ; GFX908-NEXT: ;;#ASMEND
571 ; GFX908-NEXT: v_readlane_b32 s20, v39, 21
572 ; GFX908-NEXT: ;;#ASMSTART
573 ; GFX908-NEXT: ; use s20
574 ; GFX908-NEXT: ;;#ASMEND
575 ; GFX908-NEXT: ;;#ASMSTART
576 ; GFX908-NEXT: ; def s21
577 ; GFX908-NEXT: ;;#ASMEND
578 ; GFX908-NEXT: v_writelane_b32 v39, s21, 12
579 ; GFX908-NEXT: ;;#ASMSTART
580 ; GFX908-NEXT: ; def s22
581 ; GFX908-NEXT: ;;#ASMEND
582 ; GFX908-NEXT: v_writelane_b32 v39, s22, 13
583 ; GFX908-NEXT: ;;#ASMSTART
584 ; GFX908-NEXT: ; def s23
585 ; GFX908-NEXT: ;;#ASMEND
586 ; GFX908-NEXT: v_writelane_b32 v39, s23, 14
587 ; GFX908-NEXT: ;;#ASMSTART
588 ; GFX908-NEXT: ; def s24
589 ; GFX908-NEXT: ;;#ASMEND
590 ; GFX908-NEXT: v_writelane_b32 v39, s24, 15
591 ; GFX908-NEXT: ;;#ASMSTART
592 ; GFX908-NEXT: ; def s25
593 ; GFX908-NEXT: ;;#ASMEND
594 ; GFX908-NEXT: v_writelane_b32 v39, s25, 16
595 ; GFX908-NEXT: ;;#ASMSTART
596 ; GFX908-NEXT: ; def s26
597 ; GFX908-NEXT: ;;#ASMEND
598 ; GFX908-NEXT: v_writelane_b32 v39, s26, 17
599 ; GFX908-NEXT: ;;#ASMSTART
600 ; GFX908-NEXT: ; def s27
601 ; GFX908-NEXT: ;;#ASMEND
602 ; GFX908-NEXT: v_writelane_b32 v39, s27, 18
603 ; GFX908-NEXT: ;;#ASMSTART
604 ; GFX908-NEXT: ; def s28
605 ; GFX908-NEXT: ;;#ASMEND
606 ; GFX908-NEXT: v_writelane_b32 v39, s28, 19
607 ; GFX908-NEXT: ;;#ASMSTART
608 ; GFX908-NEXT: ; def s29
609 ; GFX908-NEXT: ;;#ASMEND
610 ; GFX908-NEXT: v_writelane_b32 v39, s29, 20
611 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
612 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
613 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
614 ; GFX908-NEXT: v_readlane_b32 s4, v39, 10
615 ; GFX908-NEXT: v_readlane_b32 s6, v39, 8
616 ; GFX908-NEXT: v_readlane_b32 s8, v39, 6
617 ; GFX908-NEXT: v_readlane_b32 s10, v39, 4
618 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22
619 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3
620 ; GFX908-NEXT: v_mov_b32_e32 v31, v40
621 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2
622 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1
623 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0
624 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11
625 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9
626 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7
627 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5
628 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23
629 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
630 ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
631 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
632 ; GFX908-NEXT: s_mov_b64 exec, s[34:35]
633 ; GFX908-NEXT: s_waitcnt vmcnt(0)
634 ; GFX908-NEXT: v_readlane_b32 s4, v39, 10
635 ; GFX908-NEXT: v_readlane_b32 s6, v39, 8
636 ; GFX908-NEXT: v_readlane_b32 s8, v39, 6
637 ; GFX908-NEXT: v_readlane_b32 s10, v39, 4
638 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22
639 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11
640 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9
641 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7
642 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5
643 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3
644 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2
645 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1
646 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0
647 ; GFX908-NEXT: v_mov_b32_e32 v31, v40
648 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23
649 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12
650 ; GFX908-NEXT: ;;#ASMSTART
651 ; GFX908-NEXT: ; use s21
652 ; GFX908-NEXT: ;;#ASMEND
653 ; GFX908-NEXT: v_readlane_b32 s22, v39, 13
654 ; GFX908-NEXT: ;;#ASMSTART
655 ; GFX908-NEXT: ; use s22
656 ; GFX908-NEXT: ;;#ASMEND
657 ; GFX908-NEXT: v_readlane_b32 s23, v39, 14
658 ; GFX908-NEXT: ;;#ASMSTART
659 ; GFX908-NEXT: ; use s23
660 ; GFX908-NEXT: ;;#ASMEND
661 ; GFX908-NEXT: v_readlane_b32 s24, v39, 15
662 ; GFX908-NEXT: ;;#ASMSTART
663 ; GFX908-NEXT: ; use s24
664 ; GFX908-NEXT: ;;#ASMEND
665 ; GFX908-NEXT: v_readlane_b32 s25, v39, 16
666 ; GFX908-NEXT: ;;#ASMSTART
667 ; GFX908-NEXT: ; use s25
668 ; GFX908-NEXT: ;;#ASMEND
669 ; GFX908-NEXT: v_readlane_b32 s26, v39, 17
670 ; GFX908-NEXT: ;;#ASMSTART
671 ; GFX908-NEXT: ; use s26
672 ; GFX908-NEXT: ;;#ASMEND
673 ; GFX908-NEXT: v_readlane_b32 s27, v39, 18
674 ; GFX908-NEXT: ;;#ASMSTART
675 ; GFX908-NEXT: ; use s27
676 ; GFX908-NEXT: ;;#ASMEND
677 ; GFX908-NEXT: v_readlane_b32 s28, v39, 19
678 ; GFX908-NEXT: ;;#ASMSTART
679 ; GFX908-NEXT: ; use s28
680 ; GFX908-NEXT: ;;#ASMEND
681 ; GFX908-NEXT: v_readlane_b32 s29, v39, 20
682 ; GFX908-NEXT: ;;#ASMSTART
683 ; GFX908-NEXT: ; use s29
684 ; GFX908-NEXT: ;;#ASMEND
685 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
686 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
687 ; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
688 ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
689 ; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
690 ; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
691 ; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
692 ; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
693 ; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
694 ; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
695 ; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
696 ; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
697 ; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
698 ; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
699 ; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
700 ; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
701 ; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
702 ; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
703 ; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
704 ; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
705 ; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
706 ; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
707 ; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
708 ; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
709 ; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
710 ; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
711 ; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
712 ; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
713 ; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
714 ; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
715 ; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
716 ; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
717 ; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
718 ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
719 ; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
720 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
721 ; GFX908-NEXT: s_waitcnt vmcnt(0)
722 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
723 ; GFX908-NEXT: s_waitcnt vmcnt(0)
724 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96
725 ; GFX908-NEXT: s_waitcnt vmcnt(0)
726 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80
727 ; GFX908-NEXT: s_waitcnt vmcnt(0)
728 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64
729 ; GFX908-NEXT: s_waitcnt vmcnt(0)
730 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48
731 ; GFX908-NEXT: s_waitcnt vmcnt(0)
732 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32
733 ; GFX908-NEXT: s_waitcnt vmcnt(0)
734 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16
735 ; GFX908-NEXT: s_waitcnt vmcnt(0)
736 ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
737 ; GFX908-NEXT: s_waitcnt vmcnt(0)
738 ; GFX908-NEXT: s_mov_b64 exec, 1
739 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168
740 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
741 ; GFX908-NEXT: s_waitcnt vmcnt(0)
742 ; GFX908-NEXT: v_readlane_b32 s31, v0, 0
743 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168
744 ; GFX908-NEXT: s_waitcnt vmcnt(0)
745 ; GFX908-NEXT: s_mov_b64 exec, s[4:5]
746 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
747 ; GFX908-NEXT: s_mov_b64 exec, 1
748 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168
749 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
750 ; GFX908-NEXT: s_waitcnt vmcnt(0)
751 ; GFX908-NEXT: v_readlane_b32 s30, v0, 0
752 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168
753 ; GFX908-NEXT: s_waitcnt vmcnt(0)
754 ; GFX908-NEXT: s_mov_b64 exec, s[4:5]
755 ; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
756 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
757 ; GFX908-NEXT: s_waitcnt vmcnt(0)
758 ; GFX908-NEXT: v_readfirstlane_b32 s4, v0
759 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
760 ; GFX908-NEXT: s_waitcnt vmcnt(0)
761 ; GFX908-NEXT: v_readfirstlane_b32 s34, v0
762 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
763 ; GFX908-NEXT: s_waitcnt vmcnt(0)
764 ; GFX908-NEXT: v_readfirstlane_b32 s35, v0
765 ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
766 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
767 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
768 ; GFX908-NEXT: s_addk_i32 s32, 0xd400
769 ; GFX908-NEXT: s_mov_b32 s33, s4
770 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
771 ; GFX908-NEXT: s_setpc_b64 s[30:31]
772 %vreg0 = call <32 x float> asm sideeffect "; def $0", "=v"()
773 %v40 = call i32 asm sideeffect "; def $0","=${v40}"()
775 %s11 = call i32 asm sideeffect "; def $0","=${s11}"()
776 %s12 = call i32 asm sideeffect "; def $0","=${s12}"()
777 %s13 = call i32 asm sideeffect "; def $0","=${s13}"()
778 %s14 = call i32 asm sideeffect "; def $0","=${s14}"()
779 %s15 = call i32 asm sideeffect "; def $0","=${s15}"()
780 %s16 = call i32 asm sideeffect "; def $0","=${s16}"()
781 %s17 = call i32 asm sideeffect "; def $0","=${s17}"()
782 %s18 = call i32 asm sideeffect "; def $0","=${s18}"()
783 %s19 = call i32 asm sideeffect "; def $0","=${s19}"()
784 %s20 = call i32 asm sideeffect "; def $0","=${s20}"()
786 call void asm sideeffect "; use $0","${s11}"(i32 %s11)
787 call void asm sideeffect "; use $0","${s12}"(i32 %s12)
788 call void asm sideeffect "; use $0","${s13}"(i32 %s13)
789 call void asm sideeffect "; use $0","${s14}"(i32 %s14)
790 call void asm sideeffect "; use $0","${s15}"(i32 %s15)
791 call void asm sideeffect "; use $0","${s16}"(i32 %s16)
792 call void asm sideeffect "; use $0","${s17}"(i32 %s17)
793 call void asm sideeffect "; use $0","${s18}"(i32 %s18)
794 call void asm sideeffect "; use $0","${s19}"(i32 %s19)
795 call void asm sideeffect "; use $0","${s20}"(i32 %s20)
797 %s21 = call i32 asm sideeffect "; def $0","=${s21}"()
798 %s22 = call i32 asm sideeffect "; def $0","=${s22}"()
799 %s23 = call i32 asm sideeffect "; def $0","=${s23}"()
800 %s24 = call i32 asm sideeffect "; def $0","=${s24}"()
801 %s25 = call i32 asm sideeffect "; def $0","=${s25}"()
802 %s26 = call i32 asm sideeffect "; def $0","=${s26}"()
803 %s27 = call i32 asm sideeffect "; def $0","=${s27}"()
804 %s28 = call i32 asm sideeffect "; def $0","=${s28}"()
805 %s29 = call i32 asm sideeffect "; def $0","=${s29}"()
807 call void asm sideeffect "; use $0","${s21}"(i32 %s21)
808 call void asm sideeffect "; use $0","${s22}"(i32 %s22)
809 call void asm sideeffect "; use $0","${s23}"(i32 %s23)
810 call void asm sideeffect "; use $0","${s24}"(i32 %s24)
811 call void asm sideeffect "; use $0","${s25}"(i32 %s25)
812 call void asm sideeffect "; use $0","${s26}"(i32 %s26)
813 call void asm sideeffect "; use $0","${s27}"(i32 %s27)
814 call void asm sideeffect "; use $0","${s28}"(i32 %s28)
815 call void asm sideeffect "; use $0","${s29}"(i32 %s29)
819 store volatile <32 x float> %vreg0, ptr %parg0
; Attribute group #0, referenced by the definition of @preserve_wwm_copy_dstreg.
; It sets two string attributes: "amdgpu-num-vgpr" to 42 and "amdgpu-num-sgpr"
; to 40.
; NOTE(review): presumably these cap the VGPRs/SGPRs available to the function,
; which would produce the register pressure described in the comment at the top
; of this file - confirm against the AMDGPU backend documentation before
; changing either value, then regenerate the CHECK lines.
826 attributes #0 = { "amdgpu-num-vgpr"="42" "amdgpu-num-sgpr"="40"}
; Module flags for this test: the list holds a single entry, !0, whose key is
; "amdhsa_code_object_version" and whose value is i32 500.
; NOTE(review): the leading `i32 1` is presumably the module-flag merge
; behavior (error on conflicting values, per the LLVM LangRef), and 500
; presumably selects AMDHSA code object v5 - confirm both before editing.
828 !llvm.module.flags = !{!0}
829 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}